#Importing the input files
#Inspect Data Frames
#Null Value Calculation
#Analyze & Delete Unnecessary Columns in applicationDF
#Analyze & Delete Unnecessary Columns in previousDF
#Standardize Values
#Data Type Conversion
#Null Value Data Imputation
#Identifying the outliers
#Imbalance Analysis
#Plotting Functions
#Categorical Variables Analysis
#Numeric Variables Analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
import itertools
## ignore warnings
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
## adjust jupiter view
pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',500)
pd.set_option('display.width',1000)
pd.set_option('display.expand_frame_repr', False)
#3.Reading & Understanding the data
## import data-1
bank_data = pd.read_csv(r"C:\Users\THOTA AKHIL\Downloads\PROJECTS\RESUME PROJECTS\BANK PROJECT\19th BANK PROJECT\application_data.csv")
bank_data
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.0220 | 0.0198 | 0.0 | 0.0000 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.0000 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.0790 | 0.0554 | 0.0 | 0.0000 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.0100 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307506 | 456251 | 0 | Cash loans | M | N | N | 0 | 157500.0 | 254700.0 | 27558.0 | 225000.0 | Unaccompanied | Working | Secondary / secondary special | Separated | With parents | 0.032561 | -9327 | -236 | -8456.0 | -1982 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Sales staff | 1.0 | 1 | 1 | THURSDAY | 15 | 0 | 0 | 0 | 0 | 0 | 0 | Services | 0.145570 | 0.681632 | NaN | 0.2021 | 0.0887 | 0.9876 | 0.8300 | 0.0202 | 0.22 | 0.1034 | 0.6042 | 0.2708 | 0.0594 | 0.1484 | 0.1965 | 0.0753 | 0.1095 | 0.1008 | 0.0172 | 0.9782 | 0.7125 | 0.0172 | 0.0806 | 0.0345 | 0.4583 | 0.0417 | 0.0094 | 0.0882 | 0.0853 | 0.0 | 0.0125 | 0.2040 | 0.0887 | 0.9876 | 0.8323 | 0.0203 | 0.22 | 0.1034 | 0.6042 | 0.2708 | 0.0605 | 0.1509 | 0.2001 | 0.0757 | 0.1118 | reg oper account | block of flats | 0.2898 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | -273.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307507 | 456252 | 0 | Cash loans | F | N | Y | 0 | 72000.0 | 269550.0 | 12001.5 | 225000.0 | Unaccompanied | Pensioner | Secondary / secondary special | Widow | House / apartment | 0.025164 | -20775 | 365243 | -4388.0 | -4090 | NaN | 1 | 0 | 0 | 1 | 1 | 0 | NaN | 1.0 | 2 | 2 | MONDAY | 8 | 0 | 0 | 0 | 0 | 0 | 0 | XNA | NaN | 0.115992 | NaN | 0.0247 | 0.0435 | 0.9727 | 0.6260 | 0.0022 | 0.00 | 0.1034 | 0.0833 | 0.1250 | 0.0579 | 0.0202 | 0.0257 | 0.0000 | 0.0000 | 0.0252 | 0.0451 | 0.9727 | 0.6406 | 0.0022 | 0.0000 | 0.1034 | 0.0833 | 0.1250 | 0.0592 | 0.0220 | 0.0267 | 0.0 | 0.0000 | 0.0250 | 0.0435 | 0.9727 | 0.6310 | 0.0022 | 0.00 | 0.1034 | 0.0833 | 0.1250 | 0.0589 | 0.0205 | 0.0261 | 0.0000 | 0.0000 | reg oper account | block of flats | 0.0214 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307508 | 456253 | 0 | Cash loans | F | N | Y | 0 | 153000.0 | 677664.0 | 29979.0 | 585000.0 | Unaccompanied | Working | Higher education | Separated | House / apartment | 0.005002 | -14966 | -7921 | -6737.0 | -5150 | NaN | 1 | 1 | 0 | 1 | 0 | 1 | Managers | 1.0 | 3 | 3 | THURSDAY | 9 | 0 | 0 | 0 | 0 | 1 | 1 | School | 0.744026 | 0.535722 | 0.218859 | 0.1031 | 0.0862 | 0.9816 | 0.7484 | 0.0123 | 0.00 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0841 | 0.9279 | 0.0000 | 0.0000 | 0.1050 | 0.0894 | 0.9816 | 0.7583 | 0.0124 | 0.0000 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0918 | 0.9667 | 0.0 | 0.0000 | 0.1041 | 0.0862 | 0.9816 | 0.7518 | 0.0124 | 0.00 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0855 | 0.9445 | 0.0000 | 0.0000 | reg oper account | block of flats | 0.7970 | Panel | No | 6.0 | 0.0 | 6.0 | 0.0 | -1909.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 307509 | 456254 | 1 | Cash loans | F | N | Y | 0 | 171000.0 | 370107.0 | 20205.0 | 319500.0 | Unaccompanied | Commercial associate | Secondary / secondary special | Married | House / apartment | 0.005313 | -11961 | -4786 | -2562.0 | -931 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 9 | 0 | 0 | 0 | 1 | 1 | 0 | Business Entity Type 1 | NaN | 0.514163 | 0.661024 | 0.0124 | NaN | 0.9771 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0061 | NaN | NaN | 0.0126 | NaN | 0.9772 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0063 | NaN | NaN | 0.0125 | NaN | 0.9771 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0062 | NaN | NaN | NaN | block of flats | 0.0086 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | -322.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 307510 | 456255 | 0 | Cash loans | F | N | N | 0 | 157500.0 | 675000.0 | 49117.5 | 675000.0 | Unaccompanied | Commercial associate | Higher education | Married | House / apartment | 0.046220 | -16856 | -1262 | -5128.0 | -410 | NaN | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 2.0 | 1 | 1 | THURSDAY | 20 | 0 | 0 | 0 | 0 | 1 | 1 | Business Entity Type 3 | 0.734460 | 0.708569 | 0.113922 | 0.0742 | 0.0526 | 0.9881 | NaN | 0.0176 | 0.08 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0791 | NaN | 0.0000 | 0.0756 | 0.0546 | 0.9881 | NaN | 0.0178 | 0.0806 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0824 | NaN | 0.0000 | 0.0749 | 0.0526 | 0.9881 | NaN | 0.0177 | 0.08 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0805 | NaN | 0.0000 | NaN | block of flats | 0.0718 | Panel | No | 0.0 | 0.0 | 0.0 | 0.0 | -787.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 |
307511 rows × 122 columns
bank_data.head()
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
bank_data.tail()
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 307506 | 456251 | 0 | Cash loans | M | N | N | 0 | 157500.0 | 254700.0 | 27558.0 | 225000.0 | Unaccompanied | Working | Secondary / secondary special | Separated | With parents | 0.032561 | -9327 | -236 | -8456.0 | -1982 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Sales staff | 1.0 | 1 | 1 | THURSDAY | 15 | 0 | 0 | 0 | 0 | 0 | 0 | Services | 0.145570 | 0.681632 | NaN | 0.2021 | 0.0887 | 0.9876 | 0.8300 | 0.0202 | 0.22 | 0.1034 | 0.6042 | 0.2708 | 0.0594 | 0.1484 | 0.1965 | 0.0753 | 0.1095 | 0.1008 | 0.0172 | 0.9782 | 0.7125 | 0.0172 | 0.0806 | 0.0345 | 0.4583 | 0.0417 | 0.0094 | 0.0882 | 0.0853 | 0.0 | 0.0125 | 0.2040 | 0.0887 | 0.9876 | 0.8323 | 0.0203 | 0.22 | 0.1034 | 0.6042 | 0.2708 | 0.0605 | 0.1509 | 0.2001 | 0.0757 | 0.1118 | reg oper account | block of flats | 0.2898 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | -273.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307507 | 456252 | 0 | Cash loans | F | N | Y | 0 | 72000.0 | 269550.0 | 12001.5 | 225000.0 | Unaccompanied | Pensioner | Secondary / secondary special | Widow | House / apartment | 0.025164 | -20775 | 365243 | -4388.0 | -4090 | NaN | 1 | 0 | 0 | 1 | 1 | 0 | NaN | 1.0 | 2 | 2 | MONDAY | 8 | 0 | 0 | 0 | 0 | 0 | 0 | XNA | NaN | 0.115992 | NaN | 0.0247 | 0.0435 | 0.9727 | 0.6260 | 0.0022 | 0.00 | 0.1034 | 0.0833 | 0.1250 | 0.0579 | 0.0202 | 0.0257 | 0.0000 | 0.0000 | 0.0252 | 0.0451 | 0.9727 | 0.6406 | 0.0022 | 0.0000 | 0.1034 | 0.0833 | 0.1250 | 0.0592 | 0.0220 | 0.0267 | 0.0 | 0.0000 | 0.0250 | 0.0435 | 0.9727 | 0.6310 | 0.0022 | 0.00 | 0.1034 | 0.0833 | 0.1250 | 0.0589 | 0.0205 | 0.0261 | 0.0000 | 0.0000 | reg oper account | block of flats | 0.0214 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307508 | 456253 | 0 | Cash loans | F | N | Y | 0 | 153000.0 | 677664.0 | 29979.0 | 585000.0 | Unaccompanied | Working | Higher education | Separated | House / apartment | 0.005002 | -14966 | -7921 | -6737.0 | -5150 | NaN | 1 | 1 | 0 | 1 | 0 | 1 | Managers | 1.0 | 3 | 3 | THURSDAY | 9 | 0 | 0 | 0 | 0 | 1 | 1 | School | 0.744026 | 0.535722 | 0.218859 | 0.1031 | 0.0862 | 0.9816 | 0.7484 | 0.0123 | 0.00 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0841 | 0.9279 | 0.0000 | 0.0000 | 0.1050 | 0.0894 | 0.9816 | 0.7583 | 0.0124 | 0.0000 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0918 | 0.9667 | 0.0 | 0.0000 | 0.1041 | 0.0862 | 0.9816 | 0.7518 | 0.0124 | 0.00 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0855 | 0.9445 | 0.0000 | 0.0000 | reg oper account | block of flats | 0.7970 | Panel | No | 6.0 | 0.0 | 6.0 | 0.0 | -1909.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 307509 | 456254 | 1 | Cash loans | F | N | Y | 0 | 171000.0 | 370107.0 | 20205.0 | 319500.0 | Unaccompanied | Commercial associate | Secondary / secondary special | Married | House / apartment | 0.005313 | -11961 | -4786 | -2562.0 | -931 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 9 | 0 | 0 | 0 | 1 | 1 | 0 | Business Entity Type 1 | NaN | 0.514163 | 0.661024 | 0.0124 | NaN | 0.9771 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0061 | NaN | NaN | 0.0126 | NaN | 0.9772 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0063 | NaN | NaN | 0.0125 | NaN | 0.9771 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0062 | NaN | NaN | NaN | block of flats | 0.0086 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | -322.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 307510 | 456255 | 0 | Cash loans | F | N | N | 0 | 157500.0 | 675000.0 | 49117.5 | 675000.0 | Unaccompanied | Commercial associate | Higher education | Married | House / apartment | 0.046220 | -16856 | -1262 | -5128.0 | -410 | NaN | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 2.0 | 1 | 1 | THURSDAY | 20 | 0 | 0 | 0 | 0 | 1 | 1 | Business Entity Type 3 | 0.734460 | 0.708569 | 0.113922 | 0.0742 | 0.0526 | 0.9881 | NaN | 0.0176 | 0.08 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0791 | NaN | 0.0000 | 0.0756 | 0.0546 | 0.9881 | NaN | 0.0178 | 0.0806 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0824 | NaN | 0.0000 | 0.0749 | 0.0526 | 0.9881 | NaN | 0.0177 | 0.08 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0805 | NaN | 0.0000 | NaN | block of flats | 0.0718 | Panel | No | 0.0 | 0.0 | 0.0 | 0.0 | -787.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 |
bank_data.shape
(307511, 122)
bank_data.size
37516342
bank_data.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 122 columns): # Column Dtype --- ------ ----- 0 SK_ID_CURR int64 1 TARGET int64 2 NAME_CONTRACT_TYPE object 3 CODE_GENDER object 4 FLAG_OWN_CAR object 5 FLAG_OWN_REALTY object 6 CNT_CHILDREN int64 7 AMT_INCOME_TOTAL float64 8 AMT_CREDIT float64 9 AMT_ANNUITY float64 10 AMT_GOODS_PRICE float64 11 NAME_TYPE_SUITE object 12 NAME_INCOME_TYPE object 13 NAME_EDUCATION_TYPE object 14 NAME_FAMILY_STATUS object 15 NAME_HOUSING_TYPE object 16 REGION_POPULATION_RELATIVE float64 17 DAYS_BIRTH int64 18 DAYS_EMPLOYED int64 19 DAYS_REGISTRATION float64 20 DAYS_ID_PUBLISH int64 21 OWN_CAR_AGE float64 22 FLAG_MOBIL int64 23 FLAG_EMP_PHONE int64 24 FLAG_WORK_PHONE int64 25 FLAG_CONT_MOBILE int64 26 FLAG_PHONE int64 27 FLAG_EMAIL int64 28 OCCUPATION_TYPE object 29 CNT_FAM_MEMBERS float64 30 REGION_RATING_CLIENT int64 31 REGION_RATING_CLIENT_W_CITY int64 32 WEEKDAY_APPR_PROCESS_START object 33 HOUR_APPR_PROCESS_START int64 34 REG_REGION_NOT_LIVE_REGION int64 35 REG_REGION_NOT_WORK_REGION int64 36 LIVE_REGION_NOT_WORK_REGION int64 37 REG_CITY_NOT_LIVE_CITY int64 38 REG_CITY_NOT_WORK_CITY int64 39 LIVE_CITY_NOT_WORK_CITY int64 40 ORGANIZATION_TYPE object 41 EXT_SOURCE_1 float64 42 EXT_SOURCE_2 float64 43 EXT_SOURCE_3 float64 44 APARTMENTS_AVG float64 45 BASEMENTAREA_AVG float64 46 YEARS_BEGINEXPLUATATION_AVG float64 47 YEARS_BUILD_AVG float64 48 COMMONAREA_AVG float64 49 ELEVATORS_AVG float64 50 ENTRANCES_AVG float64 51 FLOORSMAX_AVG float64 52 FLOORSMIN_AVG float64 53 LANDAREA_AVG float64 54 LIVINGAPARTMENTS_AVG float64 55 LIVINGAREA_AVG float64 56 NONLIVINGAPARTMENTS_AVG float64 57 NONLIVINGAREA_AVG float64 58 APARTMENTS_MODE float64 59 BASEMENTAREA_MODE float64 60 YEARS_BEGINEXPLUATATION_MODE float64 61 YEARS_BUILD_MODE float64 62 COMMONAREA_MODE float64 63 ELEVATORS_MODE float64 64 ENTRANCES_MODE float64 65 FLOORSMAX_MODE float64 66 FLOORSMIN_MODE float64 67 LANDAREA_MODE float64 68 LIVINGAPARTMENTS_MODE float64 69 LIVINGAREA_MODE float64 70 NONLIVINGAPARTMENTS_MODE float64 71 NONLIVINGAREA_MODE float64 72 APARTMENTS_MEDI float64 73 BASEMENTAREA_MEDI float64 74 YEARS_BEGINEXPLUATATION_MEDI float64 75 YEARS_BUILD_MEDI float64 76 COMMONAREA_MEDI float64 77 ELEVATORS_MEDI float64 78 ENTRANCES_MEDI float64 79 FLOORSMAX_MEDI float64 80 FLOORSMIN_MEDI float64 81 LANDAREA_MEDI float64 82 LIVINGAPARTMENTS_MEDI float64 83 LIVINGAREA_MEDI float64 84 NONLIVINGAPARTMENTS_MEDI float64 85 NONLIVINGAREA_MEDI float64 86 FONDKAPREMONT_MODE object 87 HOUSETYPE_MODE object 88 TOTALAREA_MODE float64 89 WALLSMATERIAL_MODE object 90 EMERGENCYSTATE_MODE object 91 OBS_30_CNT_SOCIAL_CIRCLE float64 92 DEF_30_CNT_SOCIAL_CIRCLE float64 93 OBS_60_CNT_SOCIAL_CIRCLE float64 94 DEF_60_CNT_SOCIAL_CIRCLE float64 95 DAYS_LAST_PHONE_CHANGE float64 96 FLAG_DOCUMENT_2 int64 97 FLAG_DOCUMENT_3 int64 98 FLAG_DOCUMENT_4 int64 99 FLAG_DOCUMENT_5 int64 100 FLAG_DOCUMENT_6 int64 101 FLAG_DOCUMENT_7 int64 102 FLAG_DOCUMENT_8 int64 103 FLAG_DOCUMENT_9 int64 104 FLAG_DOCUMENT_10 int64 105 FLAG_DOCUMENT_11 int64 106 FLAG_DOCUMENT_12 int64 107 FLAG_DOCUMENT_13 int64 108 FLAG_DOCUMENT_14 int64 109 FLAG_DOCUMENT_15 int64 110 FLAG_DOCUMENT_16 int64 111 FLAG_DOCUMENT_17 int64 112 FLAG_DOCUMENT_18 int64 113 FLAG_DOCUMENT_19 int64 114 FLAG_DOCUMENT_20 int64 115 FLAG_DOCUMENT_21 int64 116 AMT_REQ_CREDIT_BUREAU_HOUR float64 117 AMT_REQ_CREDIT_BUREAU_DAY float64 118 AMT_REQ_CREDIT_BUREAU_WEEK float64 119 AMT_REQ_CREDIT_BUREAU_MON float64 120 AMT_REQ_CREDIT_BUREAU_QRT float64 121 AMT_REQ_CREDIT_BUREAU_YEAR float64 dtypes: float64(65), int64(41), object(16) memory usage: 286.2+ MB
bank_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR dtypes: float64(65), int64(41), object(16) memory usage: 286.2+ MB
bank_data.dtypes
SK_ID_CURR int64 TARGET int64 NAME_CONTRACT_TYPE object CODE_GENDER object FLAG_OWN_CAR object FLAG_OWN_REALTY object CNT_CHILDREN int64 AMT_INCOME_TOTAL float64 AMT_CREDIT float64 AMT_ANNUITY float64 AMT_GOODS_PRICE float64 NAME_TYPE_SUITE object NAME_INCOME_TYPE object NAME_EDUCATION_TYPE object NAME_FAMILY_STATUS object NAME_HOUSING_TYPE object REGION_POPULATION_RELATIVE float64 DAYS_BIRTH int64 DAYS_EMPLOYED int64 DAYS_REGISTRATION float64 DAYS_ID_PUBLISH int64 OWN_CAR_AGE float64 FLAG_MOBIL int64 FLAG_EMP_PHONE int64 FLAG_WORK_PHONE int64 FLAG_CONT_MOBILE int64 FLAG_PHONE int64 FLAG_EMAIL int64 OCCUPATION_TYPE object CNT_FAM_MEMBERS float64 REGION_RATING_CLIENT int64 REGION_RATING_CLIENT_W_CITY int64 WEEKDAY_APPR_PROCESS_START object HOUR_APPR_PROCESS_START int64 REG_REGION_NOT_LIVE_REGION int64 REG_REGION_NOT_WORK_REGION int64 LIVE_REGION_NOT_WORK_REGION int64 REG_CITY_NOT_LIVE_CITY int64 REG_CITY_NOT_WORK_CITY int64 LIVE_CITY_NOT_WORK_CITY int64 ORGANIZATION_TYPE object EXT_SOURCE_1 float64 EXT_SOURCE_2 float64 EXT_SOURCE_3 float64 APARTMENTS_AVG float64 BASEMENTAREA_AVG float64 YEARS_BEGINEXPLUATATION_AVG float64 YEARS_BUILD_AVG float64 COMMONAREA_AVG float64 ELEVATORS_AVG float64 ENTRANCES_AVG float64 FLOORSMAX_AVG float64 FLOORSMIN_AVG float64 LANDAREA_AVG float64 LIVINGAPARTMENTS_AVG float64 LIVINGAREA_AVG float64 NONLIVINGAPARTMENTS_AVG float64 NONLIVINGAREA_AVG float64 APARTMENTS_MODE float64 BASEMENTAREA_MODE float64 YEARS_BEGINEXPLUATATION_MODE float64 YEARS_BUILD_MODE float64 COMMONAREA_MODE float64 ELEVATORS_MODE float64 ENTRANCES_MODE float64 FLOORSMAX_MODE float64 FLOORSMIN_MODE float64 LANDAREA_MODE float64 LIVINGAPARTMENTS_MODE float64 LIVINGAREA_MODE float64 NONLIVINGAPARTMENTS_MODE float64 NONLIVINGAREA_MODE float64 APARTMENTS_MEDI float64 BASEMENTAREA_MEDI float64 YEARS_BEGINEXPLUATATION_MEDI float64 YEARS_BUILD_MEDI float64 COMMONAREA_MEDI float64 ELEVATORS_MEDI float64 ENTRANCES_MEDI float64 FLOORSMAX_MEDI float64 FLOORSMIN_MEDI float64 LANDAREA_MEDI float64 LIVINGAPARTMENTS_MEDI float64 LIVINGAREA_MEDI float64 NONLIVINGAPARTMENTS_MEDI float64 NONLIVINGAREA_MEDI float64 FONDKAPREMONT_MODE object HOUSETYPE_MODE object TOTALAREA_MODE float64 WALLSMATERIAL_MODE object EMERGENCYSTATE_MODE object OBS_30_CNT_SOCIAL_CIRCLE float64 DEF_30_CNT_SOCIAL_CIRCLE float64 OBS_60_CNT_SOCIAL_CIRCLE float64 DEF_60_CNT_SOCIAL_CIRCLE float64 DAYS_LAST_PHONE_CHANGE float64 FLAG_DOCUMENT_2 int64 FLAG_DOCUMENT_3 int64 FLAG_DOCUMENT_4 int64 FLAG_DOCUMENT_5 int64 FLAG_DOCUMENT_6 int64 FLAG_DOCUMENT_7 int64 FLAG_DOCUMENT_8 int64 FLAG_DOCUMENT_9 int64 FLAG_DOCUMENT_10 int64 FLAG_DOCUMENT_11 int64 FLAG_DOCUMENT_12 int64 FLAG_DOCUMENT_13 int64 FLAG_DOCUMENT_14 int64 FLAG_DOCUMENT_15 int64 FLAG_DOCUMENT_16 int64 FLAG_DOCUMENT_17 int64 FLAG_DOCUMENT_18 int64 FLAG_DOCUMENT_19 int64 FLAG_DOCUMENT_20 int64 FLAG_DOCUMENT_21 int64 AMT_REQ_CREDIT_BUREAU_HOUR float64 AMT_REQ_CREDIT_BUREAU_DAY float64 AMT_REQ_CREDIT_BUREAU_WEEK float64 AMT_REQ_CREDIT_BUREAU_MON float64 AMT_REQ_CREDIT_BUREAU_QRT float64 AMT_REQ_CREDIT_BUREAU_YEAR float64 dtype: object
bank_data.isnull().sum()
SK_ID_CURR 0 TARGET 0 NAME_CONTRACT_TYPE 0 CODE_GENDER 0 FLAG_OWN_CAR 0 FLAG_OWN_REALTY 0 CNT_CHILDREN 0 AMT_INCOME_TOTAL 0 AMT_CREDIT 0 AMT_ANNUITY 12 AMT_GOODS_PRICE 278 NAME_TYPE_SUITE 1292 NAME_INCOME_TYPE 0 NAME_EDUCATION_TYPE 0 NAME_FAMILY_STATUS 0 NAME_HOUSING_TYPE 0 REGION_POPULATION_RELATIVE 0 DAYS_BIRTH 0 DAYS_EMPLOYED 0 DAYS_REGISTRATION 0 DAYS_ID_PUBLISH 0 OWN_CAR_AGE 202929 FLAG_MOBIL 0 FLAG_EMP_PHONE 0 FLAG_WORK_PHONE 0 FLAG_CONT_MOBILE 0 FLAG_PHONE 0 FLAG_EMAIL 0 OCCUPATION_TYPE 96391 CNT_FAM_MEMBERS 2 REGION_RATING_CLIENT 0 REGION_RATING_CLIENT_W_CITY 0 WEEKDAY_APPR_PROCESS_START 0 HOUR_APPR_PROCESS_START 0 REG_REGION_NOT_LIVE_REGION 0 REG_REGION_NOT_WORK_REGION 0 LIVE_REGION_NOT_WORK_REGION 0 REG_CITY_NOT_LIVE_CITY 0 REG_CITY_NOT_WORK_CITY 0 LIVE_CITY_NOT_WORK_CITY 0 ORGANIZATION_TYPE 0 EXT_SOURCE_1 173378 EXT_SOURCE_2 660 EXT_SOURCE_3 60965 APARTMENTS_AVG 156061 BASEMENTAREA_AVG 179943 YEARS_BEGINEXPLUATATION_AVG 150007 YEARS_BUILD_AVG 204488 COMMONAREA_AVG 214865 ELEVATORS_AVG 163891 ENTRANCES_AVG 154828 FLOORSMAX_AVG 153020 FLOORSMIN_AVG 208642 LANDAREA_AVG 182590 LIVINGAPARTMENTS_AVG 210199 LIVINGAREA_AVG 154350 NONLIVINGAPARTMENTS_AVG 213514 NONLIVINGAREA_AVG 169682 APARTMENTS_MODE 156061 BASEMENTAREA_MODE 179943 YEARS_BEGINEXPLUATATION_MODE 150007 YEARS_BUILD_MODE 204488 COMMONAREA_MODE 214865 ELEVATORS_MODE 163891 ENTRANCES_MODE 154828 FLOORSMAX_MODE 153020 FLOORSMIN_MODE 208642 LANDAREA_MODE 182590 LIVINGAPARTMENTS_MODE 210199 LIVINGAREA_MODE 154350 NONLIVINGAPARTMENTS_MODE 213514 NONLIVINGAREA_MODE 169682 APARTMENTS_MEDI 156061 BASEMENTAREA_MEDI 179943 YEARS_BEGINEXPLUATATION_MEDI 150007 YEARS_BUILD_MEDI 204488 COMMONAREA_MEDI 214865 ELEVATORS_MEDI 163891 ENTRANCES_MEDI 154828 FLOORSMAX_MEDI 153020 FLOORSMIN_MEDI 208642 LANDAREA_MEDI 182590 LIVINGAPARTMENTS_MEDI 210199 LIVINGAREA_MEDI 154350 NONLIVINGAPARTMENTS_MEDI 213514 NONLIVINGAREA_MEDI 169682 FONDKAPREMONT_MODE 210295 HOUSETYPE_MODE 154297 TOTALAREA_MODE 148431 WALLSMATERIAL_MODE 156341 EMERGENCYSTATE_MODE 145755 OBS_30_CNT_SOCIAL_CIRCLE 1021 DEF_30_CNT_SOCIAL_CIRCLE 1021 OBS_60_CNT_SOCIAL_CIRCLE 1021 DEF_60_CNT_SOCIAL_CIRCLE 1021 DAYS_LAST_PHONE_CHANGE 1 FLAG_DOCUMENT_2 0 FLAG_DOCUMENT_3 0 FLAG_DOCUMENT_4 0 FLAG_DOCUMENT_5 0 FLAG_DOCUMENT_6 0 FLAG_DOCUMENT_7 0 FLAG_DOCUMENT_8 0 FLAG_DOCUMENT_9 0 FLAG_DOCUMENT_10 0 FLAG_DOCUMENT_11 0 FLAG_DOCUMENT_12 0 FLAG_DOCUMENT_13 0 FLAG_DOCUMENT_14 0 FLAG_DOCUMENT_15 0 FLAG_DOCUMENT_16 0 FLAG_DOCUMENT_17 0 FLAG_DOCUMENT_18 0 FLAG_DOCUMENT_19 0 FLAG_DOCUMENT_20 0 FLAG_DOCUMENT_21 0 AMT_REQ_CREDIT_BUREAU_HOUR 41519 AMT_REQ_CREDIT_BUREAU_DAY 41519 AMT_REQ_CREDIT_BUREAU_WEEK 41519 AMT_REQ_CREDIT_BUREAU_MON 41519 AMT_REQ_CREDIT_BUREAU_QRT 41519 AMT_REQ_CREDIT_BUREAU_YEAR 41519 dtype: int64
bank_data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | 307511.0 | 278180.518577 | 102790.175348 | 1.000020e+05 | 189145.500000 | 278202.000000 | 367142.500000 | 4.562550e+05 |
| TARGET | 307511.0 | 0.080729 | 0.272419 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| CNT_CHILDREN | 307511.0 | 0.417052 | 0.722121 | 0.000000e+00 | 0.000000 | 0.000000 | 1.000000 | 1.900000e+01 |
| AMT_INCOME_TOTAL | 307511.0 | 168797.919297 | 237123.146279 | 2.565000e+04 | 112500.000000 | 147150.000000 | 202500.000000 | 1.170000e+08 |
| AMT_CREDIT | 307511.0 | 599025.999706 | 402490.776996 | 4.500000e+04 | 270000.000000 | 513531.000000 | 808650.000000 | 4.050000e+06 |
| AMT_ANNUITY | 307499.0 | 27108.573909 | 14493.737315 | 1.615500e+03 | 16524.000000 | 24903.000000 | 34596.000000 | 2.580255e+05 |
| AMT_GOODS_PRICE | 307233.0 | 538396.207429 | 369446.460540 | 4.050000e+04 | 238500.000000 | 450000.000000 | 679500.000000 | 4.050000e+06 |
| REGION_POPULATION_RELATIVE | 307511.0 | 0.020868 | 0.013831 | 2.900000e-04 | 0.010006 | 0.018850 | 0.028663 | 7.250800e-02 |
| DAYS_BIRTH | 307511.0 | -16036.995067 | 4363.988632 | -2.522900e+04 | -19682.000000 | -15750.000000 | -12413.000000 | -7.489000e+03 |
| DAYS_EMPLOYED | 307511.0 | 63815.045904 | 141275.766519 | -1.791200e+04 | -2760.000000 | -1213.000000 | -289.000000 | 3.652430e+05 |
| DAYS_REGISTRATION | 307511.0 | -4986.120328 | 3522.886321 | -2.467200e+04 | -7479.500000 | -4504.000000 | -2010.000000 | 0.000000e+00 |
| DAYS_ID_PUBLISH | 307511.0 | -2994.202373 | 1509.450419 | -7.197000e+03 | -4299.000000 | -3254.000000 | -1720.000000 | 0.000000e+00 |
| OWN_CAR_AGE | 104582.0 | 12.061091 | 11.944812 | 0.000000e+00 | 5.000000 | 9.000000 | 15.000000 | 9.100000e+01 |
| FLAG_MOBIL | 307511.0 | 0.999997 | 0.001803 | 0.000000e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000e+00 |
| FLAG_EMP_PHONE | 307511.0 | 0.819889 | 0.384280 | 0.000000e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000e+00 |
| FLAG_WORK_PHONE | 307511.0 | 0.199368 | 0.399526 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_CONT_MOBILE | 307511.0 | 0.998133 | 0.043164 | 0.000000e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000e+00 |
| FLAG_PHONE | 307511.0 | 0.281066 | 0.449521 | 0.000000e+00 | 0.000000 | 0.000000 | 1.000000 | 1.000000e+00 |
| FLAG_EMAIL | 307511.0 | 0.056720 | 0.231307 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| CNT_FAM_MEMBERS | 307509.0 | 2.152665 | 0.910682 | 1.000000e+00 | 2.000000 | 2.000000 | 3.000000 | 2.000000e+01 |
| REGION_RATING_CLIENT | 307511.0 | 2.052463 | 0.509034 | 1.000000e+00 | 2.000000 | 2.000000 | 2.000000 | 3.000000e+00 |
| REGION_RATING_CLIENT_W_CITY | 307511.0 | 2.031521 | 0.502737 | 1.000000e+00 | 2.000000 | 2.000000 | 2.000000 | 3.000000e+00 |
| HOUR_APPR_PROCESS_START | 307511.0 | 12.063419 | 3.265832 | 0.000000e+00 | 10.000000 | 12.000000 | 14.000000 | 2.300000e+01 |
| REG_REGION_NOT_LIVE_REGION | 307511.0 | 0.015144 | 0.122126 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| REG_REGION_NOT_WORK_REGION | 307511.0 | 0.050769 | 0.219526 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| LIVE_REGION_NOT_WORK_REGION | 307511.0 | 0.040659 | 0.197499 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| REG_CITY_NOT_LIVE_CITY | 307511.0 | 0.078173 | 0.268444 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| REG_CITY_NOT_WORK_CITY | 307511.0 | 0.230454 | 0.421124 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| LIVE_CITY_NOT_WORK_CITY | 307511.0 | 0.179555 | 0.383817 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| EXT_SOURCE_1 | 134133.0 | 0.502130 | 0.211062 | 1.456813e-02 | 0.334007 | 0.505998 | 0.675053 | 9.626928e-01 |
| EXT_SOURCE_2 | 306851.0 | 0.514393 | 0.191060 | 8.170000e-08 | 0.392457 | 0.565961 | 0.663617 | 8.549997e-01 |
| EXT_SOURCE_3 | 246546.0 | 0.510853 | 0.194844 | 5.272650e-04 | 0.370650 | 0.535276 | 0.669057 | 8.960095e-01 |
| APARTMENTS_AVG | 151450.0 | 0.117440 | 0.108240 | 0.000000e+00 | 0.057700 | 0.087600 | 0.148500 | 1.000000e+00 |
| BASEMENTAREA_AVG | 127568.0 | 0.088442 | 0.082438 | 0.000000e+00 | 0.044200 | 0.076300 | 0.112200 | 1.000000e+00 |
| YEARS_BEGINEXPLUATATION_AVG | 157504.0 | 0.977735 | 0.059223 | 0.000000e+00 | 0.976700 | 0.981600 | 0.986600 | 1.000000e+00 |
| YEARS_BUILD_AVG | 103023.0 | 0.752471 | 0.113280 | 0.000000e+00 | 0.687200 | 0.755200 | 0.823200 | 1.000000e+00 |
| COMMONAREA_AVG | 92646.0 | 0.044621 | 0.076036 | 0.000000e+00 | 0.007800 | 0.021100 | 0.051500 | 1.000000e+00 |
| ELEVATORS_AVG | 143620.0 | 0.078942 | 0.134576 | 0.000000e+00 | 0.000000 | 0.000000 | 0.120000 | 1.000000e+00 |
| ENTRANCES_AVG | 152683.0 | 0.149725 | 0.100049 | 0.000000e+00 | 0.069000 | 0.137900 | 0.206900 | 1.000000e+00 |
| FLOORSMAX_AVG | 154491.0 | 0.226282 | 0.144641 | 0.000000e+00 | 0.166700 | 0.166700 | 0.333300 | 1.000000e+00 |
| FLOORSMIN_AVG | 98869.0 | 0.231894 | 0.161380 | 0.000000e+00 | 0.083300 | 0.208300 | 0.375000 | 1.000000e+00 |
| LANDAREA_AVG | 124921.0 | 0.066333 | 0.081184 | 0.000000e+00 | 0.018700 | 0.048100 | 0.085600 | 1.000000e+00 |
| LIVINGAPARTMENTS_AVG | 97312.0 | 0.100775 | 0.092576 | 0.000000e+00 | 0.050400 | 0.075600 | 0.121000 | 1.000000e+00 |
| LIVINGAREA_AVG | 153161.0 | 0.107399 | 0.110565 | 0.000000e+00 | 0.045300 | 0.074500 | 0.129900 | 1.000000e+00 |
| NONLIVINGAPARTMENTS_AVG | 93997.0 | 0.008809 | 0.047732 | 0.000000e+00 | 0.000000 | 0.000000 | 0.003900 | 1.000000e+00 |
| NONLIVINGAREA_AVG | 137829.0 | 0.028358 | 0.069523 | 0.000000e+00 | 0.000000 | 0.003600 | 0.027700 | 1.000000e+00 |
| APARTMENTS_MODE | 151450.0 | 0.114231 | 0.107936 | 0.000000e+00 | 0.052500 | 0.084000 | 0.143900 | 1.000000e+00 |
| BASEMENTAREA_MODE | 127568.0 | 0.087543 | 0.084307 | 0.000000e+00 | 0.040700 | 0.074600 | 0.112400 | 1.000000e+00 |
| YEARS_BEGINEXPLUATATION_MODE | 157504.0 | 0.977065 | 0.064575 | 0.000000e+00 | 0.976700 | 0.981600 | 0.986600 | 1.000000e+00 |
| YEARS_BUILD_MODE | 103023.0 | 0.759637 | 0.110111 | 0.000000e+00 | 0.699400 | 0.764800 | 0.823600 | 1.000000e+00 |
| COMMONAREA_MODE | 92646.0 | 0.042553 | 0.074445 | 0.000000e+00 | 0.007200 | 0.019000 | 0.049000 | 1.000000e+00 |
| ELEVATORS_MODE | 143620.0 | 0.074490 | 0.132256 | 0.000000e+00 | 0.000000 | 0.000000 | 0.120800 | 1.000000e+00 |
| ENTRANCES_MODE | 152683.0 | 0.145193 | 0.100977 | 0.000000e+00 | 0.069000 | 0.137900 | 0.206900 | 1.000000e+00 |
| FLOORSMAX_MODE | 154491.0 | 0.222315 | 0.143709 | 0.000000e+00 | 0.166700 | 0.166700 | 0.333300 | 1.000000e+00 |
| FLOORSMIN_MODE | 98869.0 | 0.228058 | 0.161160 | 0.000000e+00 | 0.083300 | 0.208300 | 0.375000 | 1.000000e+00 |
| LANDAREA_MODE | 124921.0 | 0.064958 | 0.081750 | 0.000000e+00 | 0.016600 | 0.045800 | 0.084100 | 1.000000e+00 |
| LIVINGAPARTMENTS_MODE | 97312.0 | 0.105645 | 0.097880 | 0.000000e+00 | 0.054200 | 0.077100 | 0.131300 | 1.000000e+00 |
| LIVINGAREA_MODE | 153161.0 | 0.105975 | 0.111845 | 0.000000e+00 | 0.042700 | 0.073100 | 0.125200 | 1.000000e+00 |
| NONLIVINGAPARTMENTS_MODE | 93997.0 | 0.008076 | 0.046276 | 0.000000e+00 | 0.000000 | 0.000000 | 0.003900 | 1.000000e+00 |
| NONLIVINGAREA_MODE | 137829.0 | 0.027022 | 0.070254 | 0.000000e+00 | 0.000000 | 0.001100 | 0.023100 | 1.000000e+00 |
| APARTMENTS_MEDI | 151450.0 | 0.117850 | 0.109076 | 0.000000e+00 | 0.058300 | 0.086400 | 0.148900 | 1.000000e+00 |
| BASEMENTAREA_MEDI | 127568.0 | 0.087955 | 0.082179 | 0.000000e+00 | 0.043700 | 0.075800 | 0.111600 | 1.000000e+00 |
| YEARS_BEGINEXPLUATATION_MEDI | 157504.0 | 0.977752 | 0.059897 | 0.000000e+00 | 0.976700 | 0.981600 | 0.986600 | 1.000000e+00 |
| YEARS_BUILD_MEDI | 103023.0 | 0.755746 | 0.112066 | 0.000000e+00 | 0.691400 | 0.758500 | 0.825600 | 1.000000e+00 |
| COMMONAREA_MEDI | 92646.0 | 0.044595 | 0.076144 | 0.000000e+00 | 0.007900 | 0.020800 | 0.051300 | 1.000000e+00 |
| ELEVATORS_MEDI | 143620.0 | 0.078078 | 0.134467 | 0.000000e+00 | 0.000000 | 0.000000 | 0.120000 | 1.000000e+00 |
| ENTRANCES_MEDI | 152683.0 | 0.149213 | 0.100368 | 0.000000e+00 | 0.069000 | 0.137900 | 0.206900 | 1.000000e+00 |
| FLOORSMAX_MEDI | 154491.0 | 0.225897 | 0.145067 | 0.000000e+00 | 0.166700 | 0.166700 | 0.333300 | 1.000000e+00 |
| FLOORSMIN_MEDI | 98869.0 | 0.231625 | 0.161934 | 0.000000e+00 | 0.083300 | 0.208300 | 0.375000 | 1.000000e+00 |
| LANDAREA_MEDI | 124921.0 | 0.067169 | 0.082167 | 0.000000e+00 | 0.018700 | 0.048700 | 0.086800 | 1.000000e+00 |
| LIVINGAPARTMENTS_MEDI | 97312.0 | 0.101954 | 0.093642 | 0.000000e+00 | 0.051300 | 0.076100 | 0.123100 | 1.000000e+00 |
| LIVINGAREA_MEDI | 153161.0 | 0.108607 | 0.112260 | 0.000000e+00 | 0.045700 | 0.074900 | 0.130300 | 1.000000e+00 |
| NONLIVINGAPARTMENTS_MEDI | 93997.0 | 0.008651 | 0.047415 | 0.000000e+00 | 0.000000 | 0.000000 | 0.003900 | 1.000000e+00 |
| NONLIVINGAREA_MEDI | 137829.0 | 0.028236 | 0.070166 | 0.000000e+00 | 0.000000 | 0.003100 | 0.026600 | 1.000000e+00 |
| TOTALAREA_MODE | 159080.0 | 0.102547 | 0.107462 | 0.000000e+00 | 0.041200 | 0.068800 | 0.127600 | 1.000000e+00 |
| OBS_30_CNT_SOCIAL_CIRCLE | 306490.0 | 1.422245 | 2.400989 | 0.000000e+00 | 0.000000 | 0.000000 | 2.000000 | 3.480000e+02 |
| DEF_30_CNT_SOCIAL_CIRCLE | 306490.0 | 0.143421 | 0.446698 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 3.400000e+01 |
| OBS_60_CNT_SOCIAL_CIRCLE | 306490.0 | 1.405292 | 2.379803 | 0.000000e+00 | 0.000000 | 0.000000 | 2.000000 | 3.440000e+02 |
| DEF_60_CNT_SOCIAL_CIRCLE | 306490.0 | 0.100049 | 0.362291 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 2.400000e+01 |
| DAYS_LAST_PHONE_CHANGE | 307510.0 | -962.858788 | 826.808487 | -4.292000e+03 | -1570.000000 | -757.000000 | -274.000000 | 0.000000e+00 |
| FLAG_DOCUMENT_2 | 307511.0 | 0.000042 | 0.006502 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_3 | 307511.0 | 0.710023 | 0.453752 | 0.000000e+00 | 0.000000 | 1.000000 | 1.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_4 | 307511.0 | 0.000081 | 0.009016 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_5 | 307511.0 | 0.015115 | 0.122010 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_6 | 307511.0 | 0.088055 | 0.283376 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_7 | 307511.0 | 0.000192 | 0.013850 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_8 | 307511.0 | 0.081376 | 0.273412 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_9 | 307511.0 | 0.003896 | 0.062295 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_10 | 307511.0 | 0.000023 | 0.004771 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_11 | 307511.0 | 0.003912 | 0.062424 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_12 | 307511.0 | 0.000007 | 0.002550 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_13 | 307511.0 | 0.003525 | 0.059268 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_14 | 307511.0 | 0.002936 | 0.054110 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_15 | 307511.0 | 0.001210 | 0.034760 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_16 | 307511.0 | 0.009928 | 0.099144 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_17 | 307511.0 | 0.000267 | 0.016327 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_18 | 307511.0 | 0.008130 | 0.089798 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_19 | 307511.0 | 0.000595 | 0.024387 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_20 | 307511.0 | 0.000507 | 0.022518 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| FLAG_DOCUMENT_21 | 307511.0 | 0.000335 | 0.018299 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 |
| AMT_REQ_CREDIT_BUREAU_HOUR | 265992.0 | 0.006402 | 0.083849 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 4.000000e+00 |
| AMT_REQ_CREDIT_BUREAU_DAY | 265992.0 | 0.007000 | 0.110757 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 9.000000e+00 |
| AMT_REQ_CREDIT_BUREAU_WEEK | 265992.0 | 0.034362 | 0.204685 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 8.000000e+00 |
| AMT_REQ_CREDIT_BUREAU_MON | 265992.0 | 0.267395 | 0.916002 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 2.700000e+01 |
| AMT_REQ_CREDIT_BUREAU_QRT | 265992.0 | 0.265474 | 0.794056 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 2.610000e+02 |
| AMT_REQ_CREDIT_BUREAU_YEAR | 265992.0 | 1.899974 | 1.869295 | 0.000000e+00 | 0.000000 | 1.000000 | 3.000000 | 2.500000e+01 |
pip install missingno
Requirement already satisfied: missingno in c:\users\thota akhil\anaconda3\lib\site-packages (0.5.2) Requirement already satisfied: numpy in c:\users\thota akhil\anaconda3\lib\site-packages (from missingno) (1.24.3) Requirement already satisfied: matplotlib in c:\users\thota akhil\anaconda3\lib\site-packages (from missingno) (3.7.2) Requirement already satisfied: scipy in c:\users\thota akhil\anaconda3\lib\site-packages (from missingno) (1.11.1) Requirement already satisfied: seaborn in c:\users\thota akhil\anaconda3\lib\site-packages (from missingno) (0.12.2) Requirement already satisfied: contourpy>=1.0.1 in c:\users\thota akhil\anaconda3\lib\site-packages (from matplotlib->missingno) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\users\thota akhil\anaconda3\lib\site-packages (from matplotlib->missingno) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\thota akhil\anaconda3\lib\site-packages (from matplotlib->missingno) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\thota akhil\anaconda3\lib\site-packages (from matplotlib->missingno) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\thota akhil\anaconda3\lib\site-packages (from matplotlib->missingno) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\thota akhil\anaconda3\lib\site-packages (from matplotlib->missingno) (9.4.0) Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\users\thota akhil\anaconda3\lib\site-packages (from matplotlib->missingno) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\thota akhil\anaconda3\lib\site-packages (from matplotlib->missingno) (2.8.2) Requirement already satisfied: pandas>=0.25 in c:\users\thota akhil\anaconda3\lib\site-packages (from seaborn->missingno) (2.0.3) Requirement already satisfied: pytz>=2020.1 in c:\users\thota akhil\anaconda3\lib\site-packages (from pandas>=0.25->seaborn->missingno) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\thota akhil\anaconda3\lib\site-packages (from pandas>=0.25->seaborn->missingno) (2023.3) Requirement already satisfied: six>=1.5 in c:\users\thota akhil\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->missingno) (1.16.0) Note: you may need to restart the kernel to use updated packages.
import missingno as mn
mn.matrix(bank_data)
<Axes: >
mn.bar(bank_data)
<Axes: >
mn.heatmap(bank_data)
<Axes: >
bank_data.isnull().sum()
SK_ID_CURR 0 TARGET 0 NAME_CONTRACT_TYPE 0 CODE_GENDER 0 FLAG_OWN_CAR 0 FLAG_OWN_REALTY 0 CNT_CHILDREN 0 AMT_INCOME_TOTAL 0 AMT_CREDIT 0 AMT_ANNUITY 12 AMT_GOODS_PRICE 278 NAME_TYPE_SUITE 1292 NAME_INCOME_TYPE 0 NAME_EDUCATION_TYPE 0 NAME_FAMILY_STATUS 0 NAME_HOUSING_TYPE 0 REGION_POPULATION_RELATIVE 0 DAYS_BIRTH 0 DAYS_EMPLOYED 0 DAYS_REGISTRATION 0 DAYS_ID_PUBLISH 0 OWN_CAR_AGE 202929 FLAG_MOBIL 0 FLAG_EMP_PHONE 0 FLAG_WORK_PHONE 0 FLAG_CONT_MOBILE 0 FLAG_PHONE 0 FLAG_EMAIL 0 OCCUPATION_TYPE 96391 CNT_FAM_MEMBERS 2 REGION_RATING_CLIENT 0 REGION_RATING_CLIENT_W_CITY 0 WEEKDAY_APPR_PROCESS_START 0 HOUR_APPR_PROCESS_START 0 REG_REGION_NOT_LIVE_REGION 0 REG_REGION_NOT_WORK_REGION 0 LIVE_REGION_NOT_WORK_REGION 0 REG_CITY_NOT_LIVE_CITY 0 REG_CITY_NOT_WORK_CITY 0 LIVE_CITY_NOT_WORK_CITY 0 ORGANIZATION_TYPE 0 EXT_SOURCE_1 173378 EXT_SOURCE_2 660 EXT_SOURCE_3 60965 APARTMENTS_AVG 156061 BASEMENTAREA_AVG 179943 YEARS_BEGINEXPLUATATION_AVG 150007 YEARS_BUILD_AVG 204488 COMMONAREA_AVG 214865 ELEVATORS_AVG 163891 ENTRANCES_AVG 154828 FLOORSMAX_AVG 153020 FLOORSMIN_AVG 208642 LANDAREA_AVG 182590 LIVINGAPARTMENTS_AVG 210199 LIVINGAREA_AVG 154350 NONLIVINGAPARTMENTS_AVG 213514 NONLIVINGAREA_AVG 169682 APARTMENTS_MODE 156061 BASEMENTAREA_MODE 179943 YEARS_BEGINEXPLUATATION_MODE 150007 YEARS_BUILD_MODE 204488 COMMONAREA_MODE 214865 ELEVATORS_MODE 163891 ENTRANCES_MODE 154828 FLOORSMAX_MODE 153020 FLOORSMIN_MODE 208642 LANDAREA_MODE 182590 LIVINGAPARTMENTS_MODE 210199 LIVINGAREA_MODE 154350 NONLIVINGAPARTMENTS_MODE 213514 NONLIVINGAREA_MODE 169682 APARTMENTS_MEDI 156061 BASEMENTAREA_MEDI 179943 YEARS_BEGINEXPLUATATION_MEDI 150007 YEARS_BUILD_MEDI 204488 COMMONAREA_MEDI 214865 ELEVATORS_MEDI 163891 ENTRANCES_MEDI 154828 FLOORSMAX_MEDI 153020 FLOORSMIN_MEDI 208642 LANDAREA_MEDI 182590 LIVINGAPARTMENTS_MEDI 210199 LIVINGAREA_MEDI 154350 NONLIVINGAPARTMENTS_MEDI 213514 NONLIVINGAREA_MEDI 169682 FONDKAPREMONT_MODE 210295 HOUSETYPE_MODE 154297 TOTALAREA_MODE 148431 WALLSMATERIAL_MODE 156341 EMERGENCYSTATE_MODE 145755 OBS_30_CNT_SOCIAL_CIRCLE 1021 DEF_30_CNT_SOCIAL_CIRCLE 1021 OBS_60_CNT_SOCIAL_CIRCLE 1021 DEF_60_CNT_SOCIAL_CIRCLE 1021 DAYS_LAST_PHONE_CHANGE 1 FLAG_DOCUMENT_2 0 FLAG_DOCUMENT_3 0 FLAG_DOCUMENT_4 0 FLAG_DOCUMENT_5 0 FLAG_DOCUMENT_6 0 FLAG_DOCUMENT_7 0 FLAG_DOCUMENT_8 0 FLAG_DOCUMENT_9 0 FLAG_DOCUMENT_10 0 FLAG_DOCUMENT_11 0 FLAG_DOCUMENT_12 0 FLAG_DOCUMENT_13 0 FLAG_DOCUMENT_14 0 FLAG_DOCUMENT_15 0 FLAG_DOCUMENT_16 0 FLAG_DOCUMENT_17 0 FLAG_DOCUMENT_18 0 FLAG_DOCUMENT_19 0 FLAG_DOCUMENT_20 0 FLAG_DOCUMENT_21 0 AMT_REQ_CREDIT_BUREAU_HOUR 41519 AMT_REQ_CREDIT_BUREAU_DAY 41519 AMT_REQ_CREDIT_BUREAU_WEEK 41519 AMT_REQ_CREDIT_BUREAU_MON 41519 AMT_REQ_CREDIT_BUREAU_QRT 41519 AMT_REQ_CREDIT_BUREAU_YEAR 41519 dtype: int64
round(bank_data.isnull().sum() / bank_data.shape[0] * 100.00,2)
SK_ID_CURR 0.00 TARGET 0.00 NAME_CONTRACT_TYPE 0.00 CODE_GENDER 0.00 FLAG_OWN_CAR 0.00 FLAG_OWN_REALTY 0.00 CNT_CHILDREN 0.00 AMT_INCOME_TOTAL 0.00 AMT_CREDIT 0.00 AMT_ANNUITY 0.00 AMT_GOODS_PRICE 0.09 NAME_TYPE_SUITE 0.42 NAME_INCOME_TYPE 0.00 NAME_EDUCATION_TYPE 0.00 NAME_FAMILY_STATUS 0.00 NAME_HOUSING_TYPE 0.00 REGION_POPULATION_RELATIVE 0.00 DAYS_BIRTH 0.00 DAYS_EMPLOYED 0.00 DAYS_REGISTRATION 0.00 DAYS_ID_PUBLISH 0.00 OWN_CAR_AGE 65.99 FLAG_MOBIL 0.00 FLAG_EMP_PHONE 0.00 FLAG_WORK_PHONE 0.00 FLAG_CONT_MOBILE 0.00 FLAG_PHONE 0.00 FLAG_EMAIL 0.00 OCCUPATION_TYPE 31.35 CNT_FAM_MEMBERS 0.00 REGION_RATING_CLIENT 0.00 REGION_RATING_CLIENT_W_CITY 0.00 WEEKDAY_APPR_PROCESS_START 0.00 HOUR_APPR_PROCESS_START 0.00 REG_REGION_NOT_LIVE_REGION 0.00 REG_REGION_NOT_WORK_REGION 0.00 LIVE_REGION_NOT_WORK_REGION 0.00 REG_CITY_NOT_LIVE_CITY 0.00 REG_CITY_NOT_WORK_CITY 0.00 LIVE_CITY_NOT_WORK_CITY 0.00 ORGANIZATION_TYPE 0.00 EXT_SOURCE_1 56.38 EXT_SOURCE_2 0.21 EXT_SOURCE_3 19.83 APARTMENTS_AVG 50.75 BASEMENTAREA_AVG 58.52 YEARS_BEGINEXPLUATATION_AVG 48.78 YEARS_BUILD_AVG 66.50 COMMONAREA_AVG 69.87 ELEVATORS_AVG 53.30 ENTRANCES_AVG 50.35 FLOORSMAX_AVG 49.76 FLOORSMIN_AVG 67.85 LANDAREA_AVG 59.38 LIVINGAPARTMENTS_AVG 68.35 LIVINGAREA_AVG 50.19 NONLIVINGAPARTMENTS_AVG 69.43 NONLIVINGAREA_AVG 55.18 APARTMENTS_MODE 50.75 BASEMENTAREA_MODE 58.52 YEARS_BEGINEXPLUATATION_MODE 48.78 YEARS_BUILD_MODE 66.50 COMMONAREA_MODE 69.87 ELEVATORS_MODE 53.30 ENTRANCES_MODE 50.35 FLOORSMAX_MODE 49.76 FLOORSMIN_MODE 67.85 LANDAREA_MODE 59.38 LIVINGAPARTMENTS_MODE 68.35 LIVINGAREA_MODE 50.19 NONLIVINGAPARTMENTS_MODE 69.43 NONLIVINGAREA_MODE 55.18 APARTMENTS_MEDI 50.75 BASEMENTAREA_MEDI 58.52 YEARS_BEGINEXPLUATATION_MEDI 48.78 YEARS_BUILD_MEDI 66.50 COMMONAREA_MEDI 69.87 ELEVATORS_MEDI 53.30 ENTRANCES_MEDI 50.35 FLOORSMAX_MEDI 49.76 FLOORSMIN_MEDI 67.85 LANDAREA_MEDI 59.38 LIVINGAPARTMENTS_MEDI 68.35 LIVINGAREA_MEDI 50.19 NONLIVINGAPARTMENTS_MEDI 69.43 NONLIVINGAREA_MEDI 55.18 FONDKAPREMONT_MODE 68.39 HOUSETYPE_MODE 50.18 TOTALAREA_MODE 48.27 WALLSMATERIAL_MODE 50.84 EMERGENCYSTATE_MODE 47.40 OBS_30_CNT_SOCIAL_CIRCLE 0.33 DEF_30_CNT_SOCIAL_CIRCLE 0.33 OBS_60_CNT_SOCIAL_CIRCLE 0.33 DEF_60_CNT_SOCIAL_CIRCLE 0.33 DAYS_LAST_PHONE_CHANGE 0.00 FLAG_DOCUMENT_2 0.00 FLAG_DOCUMENT_3 0.00 FLAG_DOCUMENT_4 0.00 FLAG_DOCUMENT_5 0.00 FLAG_DOCUMENT_6 0.00 FLAG_DOCUMENT_7 0.00 FLAG_DOCUMENT_8 0.00 FLAG_DOCUMENT_9 0.00 FLAG_DOCUMENT_10 0.00 FLAG_DOCUMENT_11 0.00 FLAG_DOCUMENT_12 0.00 FLAG_DOCUMENT_13 0.00 FLAG_DOCUMENT_14 0.00 FLAG_DOCUMENT_15 0.00 FLAG_DOCUMENT_16 0.00 FLAG_DOCUMENT_17 0.00 FLAG_DOCUMENT_18 0.00 FLAG_DOCUMENT_19 0.00 FLAG_DOCUMENT_20 0.00 FLAG_DOCUMENT_21 0.00 AMT_REQ_CREDIT_BUREAU_HOUR 13.50 AMT_REQ_CREDIT_BUREAU_DAY 13.50 AMT_REQ_CREDIT_BUREAU_WEEK 13.50 AMT_REQ_CREDIT_BUREAU_MON 13.50 AMT_REQ_CREDIT_BUREAU_QRT 13.50 AMT_REQ_CREDIT_BUREAU_YEAR 13.50 dtype: float64
null_bank_data = pd.DataFrame((bank_data.isnull().sum())*100/bank_data.shape[0]).reset_index()
null_bank_data.columns = ['Column Name','Null Values Percentage']
ax = sns.pointplot(data =null_bank_data, x='Column Name', y='Null Values Percentage', color = 'red')
fig = plt.figure(figsize=(30,15))
<Figure size 3000x1500 with 0 Axes>
ax = sns.pointplot(data = null_bank_data, x='Column Name', y='Null Values Percentage', color = 'red')
fig = plt.figure(figsize=(30,15))
ax.axhline(40, ls='--',color='green')
<matplotlib.lines.Line2D at 0x1f1984cdad0>
<Figure size 3000x1500 with 0 Axes>
null_bank_data = pd.DataFrame((bank_data.isnull().sum())*100/bank_data.shape[0]).reset_index()
null_bank_data.columns = ['Column Name', 'Null Values Percentage']
fig = plt.figure(figsize=(18,6))
ax = sns.pointplot(x="Column Name",y="Null Values Percentage",data=null_bank_data,color='blue')
plt.xticks(rotation =90,fontsize =7)
ax.axhline(40, ls='--',color='red')
plt.title("Percentage of Missing values 40%")
plt.ylabel("Null Values PERCENTAGE")
plt.xlabel("COLUMNS")
plt.show()
null_40 = null_bank_data[null_bank_data["Null Values Percentage"]>=40]
null_40
| Column Name | Null Values Percentage | |
|---|---|---|
| 21 | OWN_CAR_AGE | 65.990810 |
| 41 | EXT_SOURCE_1 | 56.381073 |
| 44 | APARTMENTS_AVG | 50.749729 |
| 45 | BASEMENTAREA_AVG | 58.515956 |
| 46 | YEARS_BEGINEXPLUATATION_AVG | 48.781019 |
| 47 | YEARS_BUILD_AVG | 66.497784 |
| 48 | COMMONAREA_AVG | 69.872297 |
| 49 | ELEVATORS_AVG | 53.295980 |
| 50 | ENTRANCES_AVG | 50.348768 |
| 51 | FLOORSMAX_AVG | 49.760822 |
| 52 | FLOORSMIN_AVG | 67.848630 |
| 53 | LANDAREA_AVG | 59.376738 |
| 54 | LIVINGAPARTMENTS_AVG | 68.354953 |
| 55 | LIVINGAREA_AVG | 50.193326 |
| 56 | NONLIVINGAPARTMENTS_AVG | 69.432963 |
| 57 | NONLIVINGAREA_AVG | 55.179164 |
| 58 | APARTMENTS_MODE | 50.749729 |
| 59 | BASEMENTAREA_MODE | 58.515956 |
| 60 | YEARS_BEGINEXPLUATATION_MODE | 48.781019 |
| 61 | YEARS_BUILD_MODE | 66.497784 |
| 62 | COMMONAREA_MODE | 69.872297 |
| 63 | ELEVATORS_MODE | 53.295980 |
| 64 | ENTRANCES_MODE | 50.348768 |
| 65 | FLOORSMAX_MODE | 49.760822 |
| 66 | FLOORSMIN_MODE | 67.848630 |
| 67 | LANDAREA_MODE | 59.376738 |
| 68 | LIVINGAPARTMENTS_MODE | 68.354953 |
| 69 | LIVINGAREA_MODE | 50.193326 |
| 70 | NONLIVINGAPARTMENTS_MODE | 69.432963 |
| 71 | NONLIVINGAREA_MODE | 55.179164 |
| 72 | APARTMENTS_MEDI | 50.749729 |
| 73 | BASEMENTAREA_MEDI | 58.515956 |
| 74 | YEARS_BEGINEXPLUATATION_MEDI | 48.781019 |
| 75 | YEARS_BUILD_MEDI | 66.497784 |
| 76 | COMMONAREA_MEDI | 69.872297 |
| 77 | ELEVATORS_MEDI | 53.295980 |
| 78 | ENTRANCES_MEDI | 50.348768 |
| 79 | FLOORSMAX_MEDI | 49.760822 |
| 80 | FLOORSMIN_MEDI | 67.848630 |
| 81 | LANDAREA_MEDI | 59.376738 |
| 82 | LIVINGAPARTMENTS_MEDI | 68.354953 |
| 83 | LIVINGAREA_MEDI | 50.193326 |
| 84 | NONLIVINGAPARTMENTS_MEDI | 69.432963 |
| 85 | NONLIVINGAREA_MEDI | 55.179164 |
| 86 | FONDKAPREMONT_MODE | 68.386172 |
| 87 | HOUSETYPE_MODE | 50.176091 |
| 88 | TOTALAREA_MODE | 48.268517 |
| 89 | WALLSMATERIAL_MODE | 50.840783 |
| 90 | EMERGENCYSTATE_MODE | 47.398304 |
len(null_40)
49
previous_data = pd.read_csv(r"C:\Users\THOTA AKHIL\Downloads\PROJECTS\RESUME PROJECTS\BANK PROJECT\19th BANK PROJECT\previous_application.csv\previous_application.csv")
previous_data
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2030495 | 271877 | Consumer loans | 1730.430 | 17145.0 | 17145.0 | 0.0 | 17145.0 | SATURDAY | 15 | Y | 1 | 0.000000 | 0.182832 | 0.867336 | XAP | Approved | -73 | Cash through the bank | XAP | NaN | Repeater | Mobile | POS | XNA | Country-wide | 35 | Connectivity | 12.0 | middle | POS mobile with interest | 365243.0 | -42.0 | 300.0 | -42.0 | -37.0 | 0.0 |
| 1 | 2802425 | 108129 | Cash loans | 25188.615 | 607500.0 | 679671.0 | NaN | 607500.0 | THURSDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -164 | XNA | XAP | Unaccompanied | Repeater | XNA | Cash | x-sell | Contact center | -1 | XNA | 36.0 | low_action | Cash X-Sell: low | 365243.0 | -134.0 | 916.0 | 365243.0 | 365243.0 | 1.0 |
| 2 | 2523466 | 122040 | Cash loans | 15060.735 | 112500.0 | 136444.5 | NaN | 112500.0 | TUESDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -301 | Cash through the bank | XAP | Spouse, partner | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | high | Cash X-Sell: high | 365243.0 | -271.0 | 59.0 | 365243.0 | 365243.0 | 1.0 |
| 3 | 2819243 | 176158 | Cash loans | 47041.335 | 450000.0 | 470790.0 | NaN | 450000.0 | MONDAY | 7 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -512 | Cash through the bank | XAP | NaN | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | middle | Cash X-Sell: middle | 365243.0 | -482.0 | -152.0 | -182.0 | -177.0 | 1.0 |
| 4 | 1784265 | 202054 | Cash loans | 31924.395 | 337500.0 | 404055.0 | NaN | 337500.0 | THURSDAY | 9 | Y | 1 | NaN | NaN | NaN | Repairs | Refused | -781 | Cash through the bank | HC | NaN | Repeater | XNA | Cash | walk-in | Credit and cash offices | -1 | XNA | 24.0 | high | Cash Street: high | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1670209 | 2300464 | 352015 | Consumer loans | 14704.290 | 267295.5 | 311400.0 | 0.0 | 267295.5 | WEDNESDAY | 12 | Y | 1 | 0.000000 | NaN | NaN | XAP | Approved | -544 | Cash through the bank | XAP | NaN | Refreshed | Furniture | POS | XNA | Stone | 43 | Furniture | 30.0 | low_normal | POS industry with interest | 365243.0 | -508.0 | 362.0 | -358.0 | -351.0 | 0.0 |
| 1670210 | 2357031 | 334635 | Consumer loans | 6622.020 | 87750.0 | 64291.5 | 29250.0 | 87750.0 | TUESDAY | 15 | Y | 1 | 0.340554 | NaN | NaN | XAP | Approved | -1694 | Cash through the bank | XAP | Unaccompanied | New | Furniture | POS | XNA | Stone | 43 | Furniture | 12.0 | middle | POS industry with interest | 365243.0 | -1604.0 | -1274.0 | -1304.0 | -1297.0 | 0.0 |
| 1670211 | 2659632 | 249544 | Consumer loans | 11520.855 | 105237.0 | 102523.5 | 10525.5 | 105237.0 | MONDAY | 12 | Y | 1 | 0.101401 | NaN | NaN | XAP | Approved | -1488 | Cash through the bank | XAP | Spouse, partner | Repeater | Consumer Electronics | POS | XNA | Country-wide | 1370 | Consumer electronics | 10.0 | low_normal | POS household with interest | 365243.0 | -1457.0 | -1187.0 | -1187.0 | -1181.0 | 0.0 |
| 1670212 | 2785582 | 400317 | Cash loans | 18821.520 | 180000.0 | 191880.0 | NaN | 180000.0 | WEDNESDAY | 9 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -1185 | Cash through the bank | XAP | Family | Repeater | XNA | Cash | x-sell | AP+ (Cash loan) | -1 | XNA | 12.0 | low_normal | Cash X-Sell: low | 365243.0 | -1155.0 | -825.0 | -825.0 | -817.0 | 1.0 |
| 1670213 | 2418762 | 261212 | Cash loans | 16431.300 | 360000.0 | 360000.0 | NaN | 360000.0 | SUNDAY | 10 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -1193 | Cash through the bank | XAP | Family | Repeater | XNA | Cash | x-sell | AP+ (Cash loan) | -1 | XNA | 48.0 | middle | Cash X-Sell: middle | 365243.0 | -1163.0 | 247.0 | -443.0 | -423.0 | 0.0 |
1670214 rows × 37 columns
previous_data.head()
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2030495 | 271877 | Consumer loans | 1730.430 | 17145.0 | 17145.0 | 0.0 | 17145.0 | SATURDAY | 15 | Y | 1 | 0.0 | 0.182832 | 0.867336 | XAP | Approved | -73 | Cash through the bank | XAP | NaN | Repeater | Mobile | POS | XNA | Country-wide | 35 | Connectivity | 12.0 | middle | POS mobile with interest | 365243.0 | -42.0 | 300.0 | -42.0 | -37.0 | 0.0 |
| 1 | 2802425 | 108129 | Cash loans | 25188.615 | 607500.0 | 679671.0 | NaN | 607500.0 | THURSDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -164 | XNA | XAP | Unaccompanied | Repeater | XNA | Cash | x-sell | Contact center | -1 | XNA | 36.0 | low_action | Cash X-Sell: low | 365243.0 | -134.0 | 916.0 | 365243.0 | 365243.0 | 1.0 |
| 2 | 2523466 | 122040 | Cash loans | 15060.735 | 112500.0 | 136444.5 | NaN | 112500.0 | TUESDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -301 | Cash through the bank | XAP | Spouse, partner | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | high | Cash X-Sell: high | 365243.0 | -271.0 | 59.0 | 365243.0 | 365243.0 | 1.0 |
| 3 | 2819243 | 176158 | Cash loans | 47041.335 | 450000.0 | 470790.0 | NaN | 450000.0 | MONDAY | 7 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -512 | Cash through the bank | XAP | NaN | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | middle | Cash X-Sell: middle | 365243.0 | -482.0 | -152.0 | -182.0 | -177.0 | 1.0 |
| 4 | 1784265 | 202054 | Cash loans | 31924.395 | 337500.0 | 404055.0 | NaN | 337500.0 | THURSDAY | 9 | Y | 1 | NaN | NaN | NaN | Repairs | Refused | -781 | Cash through the bank | HC | NaN | Repeater | XNA | Cash | walk-in | Credit and cash offices | -1 | XNA | 24.0 | high | Cash Street: high | NaN | NaN | NaN | NaN | NaN | NaN |
previous_data.tail()
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1670209 | 2300464 | 352015 | Consumer loans | 14704.290 | 267295.5 | 311400.0 | 0.0 | 267295.5 | WEDNESDAY | 12 | Y | 1 | 0.000000 | NaN | NaN | XAP | Approved | -544 | Cash through the bank | XAP | NaN | Refreshed | Furniture | POS | XNA | Stone | 43 | Furniture | 30.0 | low_normal | POS industry with interest | 365243.0 | -508.0 | 362.0 | -358.0 | -351.0 | 0.0 |
| 1670210 | 2357031 | 334635 | Consumer loans | 6622.020 | 87750.0 | 64291.5 | 29250.0 | 87750.0 | TUESDAY | 15 | Y | 1 | 0.340554 | NaN | NaN | XAP | Approved | -1694 | Cash through the bank | XAP | Unaccompanied | New | Furniture | POS | XNA | Stone | 43 | Furniture | 12.0 | middle | POS industry with interest | 365243.0 | -1604.0 | -1274.0 | -1304.0 | -1297.0 | 0.0 |
| 1670211 | 2659632 | 249544 | Consumer loans | 11520.855 | 105237.0 | 102523.5 | 10525.5 | 105237.0 | MONDAY | 12 | Y | 1 | 0.101401 | NaN | NaN | XAP | Approved | -1488 | Cash through the bank | XAP | Spouse, partner | Repeater | Consumer Electronics | POS | XNA | Country-wide | 1370 | Consumer electronics | 10.0 | low_normal | POS household with interest | 365243.0 | -1457.0 | -1187.0 | -1187.0 | -1181.0 | 0.0 |
| 1670212 | 2785582 | 400317 | Cash loans | 18821.520 | 180000.0 | 191880.0 | NaN | 180000.0 | WEDNESDAY | 9 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -1185 | Cash through the bank | XAP | Family | Repeater | XNA | Cash | x-sell | AP+ (Cash loan) | -1 | XNA | 12.0 | low_normal | Cash X-Sell: low | 365243.0 | -1155.0 | -825.0 | -825.0 | -817.0 | 1.0 |
| 1670213 | 2418762 | 261212 | Cash loans | 16431.300 | 360000.0 | 360000.0 | NaN | 360000.0 | SUNDAY | 10 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -1193 | Cash through the bank | XAP | Family | Repeater | XNA | Cash | x-sell | AP+ (Cash loan) | -1 | XNA | 48.0 | middle | Cash X-Sell: middle | 365243.0 | -1163.0 | 247.0 | -443.0 | -423.0 | 0.0 |
previous_data.shape
(1670214, 37)
previous_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1670214 entries, 0 to 1670213 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_PREV 1670214 non-null int64 1 SK_ID_CURR 1670214 non-null int64 2 NAME_CONTRACT_TYPE 1670214 non-null object 3 AMT_ANNUITY 1297979 non-null float64 4 AMT_APPLICATION 1670214 non-null float64 5 AMT_CREDIT 1670213 non-null float64 6 AMT_DOWN_PAYMENT 774370 non-null float64 7 AMT_GOODS_PRICE 1284699 non-null float64 8 WEEKDAY_APPR_PROCESS_START 1670214 non-null object 9 HOUR_APPR_PROCESS_START 1670214 non-null int64 10 FLAG_LAST_APPL_PER_CONTRACT 1670214 non-null object 11 NFLAG_LAST_APPL_IN_DAY 1670214 non-null int64 12 RATE_DOWN_PAYMENT 774370 non-null float64 13 RATE_INTEREST_PRIMARY 5951 non-null float64 14 RATE_INTEREST_PRIVILEGED 5951 non-null float64 15 NAME_CASH_LOAN_PURPOSE 1670214 non-null object 16 NAME_CONTRACT_STATUS 1670214 non-null object 17 DAYS_DECISION 1670214 non-null int64 18 NAME_PAYMENT_TYPE 1670214 non-null object 19 CODE_REJECT_REASON 1670214 non-null object 20 NAME_TYPE_SUITE 849809 non-null object 21 NAME_CLIENT_TYPE 1670214 non-null object 22 NAME_GOODS_CATEGORY 1670214 non-null object 23 NAME_PORTFOLIO 1670214 non-null object 24 NAME_PRODUCT_TYPE 1670214 non-null object 25 CHANNEL_TYPE 1670214 non-null object 26 SELLERPLACE_AREA 1670214 non-null int64 27 NAME_SELLER_INDUSTRY 1670214 non-null object 28 CNT_PAYMENT 1297984 non-null float64 29 NAME_YIELD_GROUP 1670214 non-null object 30 PRODUCT_COMBINATION 1669868 non-null object 31 DAYS_FIRST_DRAWING 997149 non-null float64 32 DAYS_FIRST_DUE 997149 non-null float64 33 DAYS_LAST_DUE_1ST_VERSION 997149 non-null float64 34 DAYS_LAST_DUE 997149 non-null float64 35 DAYS_TERMINATION 997149 non-null float64 36 NFLAG_INSURED_ON_APPROVAL 997149 non-null float64 dtypes: float64(15), int64(6), object(16) memory usage: 471.5+ MB
previous_data.isnull().sum()
SK_ID_PREV 0 SK_ID_CURR 0 NAME_CONTRACT_TYPE 0 AMT_ANNUITY 372235 AMT_APPLICATION 0 AMT_CREDIT 1 AMT_DOWN_PAYMENT 895844 AMT_GOODS_PRICE 385515 WEEKDAY_APPR_PROCESS_START 0 HOUR_APPR_PROCESS_START 0 FLAG_LAST_APPL_PER_CONTRACT 0 NFLAG_LAST_APPL_IN_DAY 0 RATE_DOWN_PAYMENT 895844 RATE_INTEREST_PRIMARY 1664263 RATE_INTEREST_PRIVILEGED 1664263 NAME_CASH_LOAN_PURPOSE 0 NAME_CONTRACT_STATUS 0 DAYS_DECISION 0 NAME_PAYMENT_TYPE 0 CODE_REJECT_REASON 0 NAME_TYPE_SUITE 820405 NAME_CLIENT_TYPE 0 NAME_GOODS_CATEGORY 0 NAME_PORTFOLIO 0 NAME_PRODUCT_TYPE 0 CHANNEL_TYPE 0 SELLERPLACE_AREA 0 NAME_SELLER_INDUSTRY 0 CNT_PAYMENT 372230 NAME_YIELD_GROUP 0 PRODUCT_COMBINATION 346 DAYS_FIRST_DRAWING 673065 DAYS_FIRST_DUE 673065 DAYS_LAST_DUE_1ST_VERSION 673065 DAYS_LAST_DUE 673065 DAYS_TERMINATION 673065 NFLAG_INSURED_ON_APPROVAL 673065 dtype: int64
previous_data.describe()
| SK_ID_PREV | SK_ID_CURR | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | HOUR_APPR_PROCESS_START | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | DAYS_DECISION | SELLERPLACE_AREA | CNT_PAYMENT | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.670214e+06 | 1.670214e+06 | 1.297979e+06 | 1.670214e+06 | 1.670213e+06 | 7.743700e+05 | 1.284699e+06 | 1.670214e+06 | 1.670214e+06 | 774370.000000 | 5951.000000 | 5951.000000 | 1.670214e+06 | 1.670214e+06 | 1.297984e+06 | 997149.000000 | 997149.000000 | 997149.000000 | 997149.000000 | 997149.000000 | 997149.000000 |
| mean | 1.923089e+06 | 2.783572e+05 | 1.595512e+04 | 1.752339e+05 | 1.961140e+05 | 6.697402e+03 | 2.278473e+05 | 1.248418e+01 | 9.964675e-01 | 0.079637 | 0.188357 | 0.773503 | -8.806797e+02 | 3.139511e+02 | 1.605408e+01 | 342209.855039 | 13826.269337 | 33767.774054 | 76582.403064 | 81992.343838 | 0.332570 |
| std | 5.325980e+05 | 1.028148e+05 | 1.478214e+04 | 2.927798e+05 | 3.185746e+05 | 2.092150e+04 | 3.153966e+05 | 3.334028e+00 | 5.932963e-02 | 0.107823 | 0.087671 | 0.100879 | 7.790997e+02 | 7.127443e+03 | 1.456729e+01 | 88916.115834 | 72444.869708 | 106857.034789 | 149647.415123 | 153303.516729 | 0.471134 |
| min | 1.000001e+06 | 1.000010e+05 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -9.000000e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -0.000015 | 0.034781 | 0.373150 | -2.922000e+03 | -1.000000e+00 | 0.000000e+00 | -2922.000000 | -2892.000000 | -2801.000000 | -2889.000000 | -2874.000000 | 0.000000 |
| 25% | 1.461857e+06 | 1.893290e+05 | 6.321780e+03 | 1.872000e+04 | 2.416050e+04 | 0.000000e+00 | 5.084100e+04 | 1.000000e+01 | 1.000000e+00 | 0.000000 | 0.160716 | 0.715645 | -1.300000e+03 | -1.000000e+00 | 6.000000e+00 | 365243.000000 | -1628.000000 | -1242.000000 | -1314.000000 | -1270.000000 | 0.000000 |
| 50% | 1.923110e+06 | 2.787145e+05 | 1.125000e+04 | 7.104600e+04 | 8.054100e+04 | 1.638000e+03 | 1.123200e+05 | 1.200000e+01 | 1.000000e+00 | 0.051605 | 0.189122 | 0.835095 | -5.810000e+02 | 3.000000e+00 | 1.200000e+01 | 365243.000000 | -831.000000 | -361.000000 | -537.000000 | -499.000000 | 0.000000 |
| 75% | 2.384280e+06 | 3.675140e+05 | 2.065842e+04 | 1.803600e+05 | 2.164185e+05 | 7.740000e+03 | 2.340000e+05 | 1.500000e+01 | 1.000000e+00 | 0.108909 | 0.193330 | 0.852537 | -2.800000e+02 | 8.200000e+01 | 2.400000e+01 | 365243.000000 | -411.000000 | 129.000000 | -74.000000 | -44.000000 | 1.000000 |
| max | 2.845382e+06 | 4.562550e+05 | 4.180581e+05 | 6.905160e+06 | 6.905160e+06 | 3.060045e+06 | 6.905160e+06 | 2.300000e+01 | 1.000000e+00 | 1.000000 | 1.000000 | 1.000000 | -1.000000e+00 | 4.000000e+06 | 8.400000e+01 | 365243.000000 | 365243.000000 | 365243.000000 | 365243.000000 | 365243.000000 | 1.000000 |
mn.bar(previous_data)
<Axes: >
mn.matrix(previous_data)
<Axes: >
round(previous_data.isnull().sum()/previous_data.shape[0] * 100.00,2)
SK_ID_PREV 0.00 SK_ID_CURR 0.00 NAME_CONTRACT_TYPE 0.00 AMT_ANNUITY 22.29 AMT_APPLICATION 0.00 AMT_CREDIT 0.00 AMT_DOWN_PAYMENT 53.64 AMT_GOODS_PRICE 23.08 WEEKDAY_APPR_PROCESS_START 0.00 HOUR_APPR_PROCESS_START 0.00 FLAG_LAST_APPL_PER_CONTRACT 0.00 NFLAG_LAST_APPL_IN_DAY 0.00 RATE_DOWN_PAYMENT 53.64 RATE_INTEREST_PRIMARY 99.64 RATE_INTEREST_PRIVILEGED 99.64 NAME_CASH_LOAN_PURPOSE 0.00 NAME_CONTRACT_STATUS 0.00 DAYS_DECISION 0.00 NAME_PAYMENT_TYPE 0.00 CODE_REJECT_REASON 0.00 NAME_TYPE_SUITE 49.12 NAME_CLIENT_TYPE 0.00 NAME_GOODS_CATEGORY 0.00 NAME_PORTFOLIO 0.00 NAME_PRODUCT_TYPE 0.00 CHANNEL_TYPE 0.00 SELLERPLACE_AREA 0.00 NAME_SELLER_INDUSTRY 0.00 CNT_PAYMENT 22.29 NAME_YIELD_GROUP 0.00 PRODUCT_COMBINATION 0.02 DAYS_FIRST_DRAWING 40.30 DAYS_FIRST_DUE 40.30 DAYS_LAST_DUE_1ST_VERSION 40.30 DAYS_LAST_DUE 40.30 DAYS_TERMINATION 40.30 NFLAG_INSURED_ON_APPROVAL 40.30 dtype: float64
null_previous_data = pd.DataFrame((previous_data.isnull().sum())*100/previous_data.shape[0]).reset_index()
null_previous_data.columns = ['Column Name', 'Null Values Percentage']
fig = plt.figure(figsize=(18,6))
ax = sns.pointplot(x="Column Name",y="Null Values Percentage",data=null_previous_data,color ='red')
plt.xticks(rotation =90,fontsize =7)
ax.axhline(40, ls='--',color='green')
plt.title("Percentage of Missing values in previousDF data")
plt.ylabel("Null Values PERCENTAGE")
plt.xlabel("COLUMNS")
plt.show()
null40 = null_previous_data[null_previous_data["Null Values Percentage"]>=40]
null40
| Column Name | Null Values Percentage | |
|---|---|---|
| 6 | AMT_DOWN_PAYMENT | 53.636480 |
| 12 | RATE_DOWN_PAYMENT | 53.636480 |
| 13 | RATE_INTEREST_PRIMARY | 99.643698 |
| 14 | RATE_INTEREST_PRIVILEGED | 99.643698 |
| 20 | NAME_TYPE_SUITE | 49.119754 |
| 31 | DAYS_FIRST_DRAWING | 40.298129 |
| 32 | DAYS_FIRST_DUE | 40.298129 |
| 33 | DAYS_LAST_DUE_1ST_VERSION | 40.298129 |
| 34 | DAYS_LAST_DUE | 40.298129 |
| 35 | DAYS_TERMINATION | 40.298129 |
| 36 | NFLAG_INSURED_ON_APPROVAL | 40.298129 |
len(null40)
11
EXT_SOURCE_X
a = bank_data[["EXT_SOURCE_1","EXT_SOURCE_2","EXT_SOURCE_3","TARGET"]]
a
| EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | TARGET | |
|---|---|---|---|---|
| 0 | 0.083037 | 0.262949 | 0.139376 | 1 |
| 1 | 0.311267 | 0.622246 | NaN | 0 |
| 2 | NaN | 0.555912 | 0.729567 | 0 |
| 3 | NaN | 0.650442 | NaN | 0 |
| 4 | NaN | 0.322738 | NaN | 0 |
| ... | ... | ... | ... | ... |
| 307506 | 0.145570 | 0.681632 | NaN | 0 |
| 307507 | NaN | 0.115992 | NaN | 0 |
| 307508 | 0.744026 | 0.535722 | 0.218859 | 0 |
| 307509 | NaN | 0.514163 | 0.661024 | 1 |
| 307510 | 0.734460 | 0.708569 | 0.113922 | 0 |
307511 rows × 4 columns
sns.heatmap(a.corr(), annot = True, cmap = "RdYlGn" )
<Axes: >
Unwanted_application = null_40["Column Name"].tolist()+ ['EXT_SOURCE_2','EXT_SOURCE_3']
Unwanted_application
['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
len(Unwanted_application )
51
# Checking the relevance of Flag_Document and whether it has any relation with loan repayment status
col_document = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3','FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6','FLAG_DOCUMENT_7',
'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9','FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12','FLAG_DOCUMENT_13',
'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15','FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18',
'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
flag_data = bank_data[col_document+["TARGET"]]
# Assuming col_document is a list or an iterable containing the column names
length = len(col_document)
flag_data["TARGET"] = flag_data["TARGET"].replace({1:"Defaulter",0:"Repayer"})
fig = plt.figure(figsize=(21,24))
for i, j in itertools.zip_longest(col_document, range(length)):
plt.subplot(5,4,j+1)
ax = sns.countplot(x = flag_data[i], hue = flag_data["TARGET"], palette = ["r","b"])
col_document.remove('FLAG_DOCUMENT_3')
Unwanted_application = Unwanted_application + col_document
len(Unwanted_application)
70
# checking is there is any correlation between mobile phone, work phone etc, email, Family members and Region rating
contact_column = ['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
'FLAG_PHONE', 'FLAG_EMAIL','TARGET']
contact_corr = bank_data[contact_column].corr()
contact_corr
| FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | TARGET | |
|---|---|---|---|---|---|---|---|
| FLAG_MOBIL | 1.000000 | -0.000845 | 0.000900 | -0.000078 | 0.001128 | 0.000442 | 0.000534 |
| FLAG_EMP_PHONE | -0.000845 | 1.000000 | 0.233801 | -0.012819 | -0.016131 | 0.062542 | 0.045982 |
| FLAG_WORK_PHONE | 0.000900 | 0.233801 | 1.000000 | 0.021580 | 0.293105 | -0.011520 | 0.028524 |
| FLAG_CONT_MOBILE | -0.000078 | -0.012819 | 0.021580 | 1.000000 | 0.006257 | -0.005356 | 0.000370 |
| FLAG_PHONE | 0.001128 | -0.016131 | 0.293105 | 0.006257 | 1.000000 | 0.014657 | -0.023806 |
| FLAG_EMAIL | 0.000442 | 0.062542 | -0.011520 | -0.005356 | 0.014657 | 1.000000 | -0.001758 |
| TARGET | 0.000534 | 0.045982 | 0.028524 | 0.000370 | -0.023806 | -0.001758 | 1.000000 |
fig = plt.figure(figsize = (10,8))
ax = sns.heatmap(contact_corr)
ax = sns.heatmap(contact_corr, annot = True, cmap = ("RdYlGn"))
contact_column.remove('TARGET')
Unwanted_application = Unwanted_application + contact_column
len(Unwanted_application)
76
Total 114 columns can be deleted from applicationDF
# Dropping the unnecessary columns from applicationDF
bank_data.drop( labels = Unwanted_application, axis = 1, inplace = True)
bank_data.shape
(307511, 46)
bank_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 46 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307511 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307499 non-null float64 10 AMT_GOODS_PRICE 307233 non-null float64 11 NAME_TYPE_SUITE 306219 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null float64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 OCCUPATION_TYPE 211120 non-null object 22 CNT_FAM_MEMBERS 307509 non-null float64 23 REGION_RATING_CLIENT 307511 non-null int64 24 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 25 WEEKDAY_APPR_PROCESS_START 307511 non-null object 26 HOUR_APPR_PROCESS_START 307511 non-null int64 27 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 28 REG_REGION_NOT_WORK_REGION 307511 non-null int64 29 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 30 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 31 REG_CITY_NOT_WORK_CITY 307511 non-null int64 32 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 33 ORGANIZATION_TYPE 307511 non-null object 34 OBS_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 35 DEF_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 36 OBS_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 37 DEF_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 38 DAYS_LAST_PHONE_CHANGE 307510 non-null float64 39 FLAG_DOCUMENT_3 307511 non-null int64 40 AMT_REQ_CREDIT_BUREAU_HOUR 265992 non-null float64 41 AMT_REQ_CREDIT_BUREAU_DAY 265992 non-null float64 42 AMT_REQ_CREDIT_BUREAU_WEEK 265992 non-null float64 43 AMT_REQ_CREDIT_BUREAU_MON 265992 non-null float64 44 AMT_REQ_CREDIT_BUREAU_QRT 265992 non-null float64 45 AMT_REQ_CREDIT_BUREAU_YEAR 265992 non-null float64 dtypes: float64(18), int64(16), object(12) memory usage: 107.9+ MB
##### Insight:
##### After deleting unnecessary columns, there are 46 columns remaining in bank_data
## Analyze & Delete Unnecessary Columns in previousDF
# Getting the 11 columns which has more than 40% unknown
Unwanted_previous_data = null40["Column Name"].tolist()
Unwanted_previous_data
['AMT_DOWN_PAYMENT', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'NAME_TYPE_SUITE', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL']
##### Listing down columns which are not needed
Unnecessary_previous = ['WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START',
'FLAG_LAST_APPL_PER_CONTRACT','NFLAG_LAST_APPL_IN_DAY']
Unwanted_previous = Unwanted_previous_data + Unnecessary_previous
len(Unwanted_previous)
15
Unwanted_previous
['AMT_DOWN_PAYMENT', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'NAME_TYPE_SUITE', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY']
# Insight:
# Total 15 columns can be deleted from previous_data
# Dropping the unnecessary columns from previous
previous_data.drop(labels = Unnecessary_previous, axis = 1, inplace = True)
# Inspecting the dataframe after removal of unnecessary columns
previous_data.shape
(1670214, 33)
previous_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1670214 entries, 0 to 1670213 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_PREV 1670214 non-null int64 1 SK_ID_CURR 1670214 non-null int64 2 NAME_CONTRACT_TYPE 1670214 non-null object 3 AMT_ANNUITY 1297979 non-null float64 4 AMT_APPLICATION 1670214 non-null float64 5 AMT_CREDIT 1670213 non-null float64 6 AMT_DOWN_PAYMENT 774370 non-null float64 7 AMT_GOODS_PRICE 1284699 non-null float64 8 RATE_DOWN_PAYMENT 774370 non-null float64 9 RATE_INTEREST_PRIMARY 5951 non-null float64 10 RATE_INTEREST_PRIVILEGED 5951 non-null float64 11 NAME_CASH_LOAN_PURPOSE 1670214 non-null object 12 NAME_CONTRACT_STATUS 1670214 non-null object 13 DAYS_DECISION 1670214 non-null int64 14 NAME_PAYMENT_TYPE 1670214 non-null object 15 CODE_REJECT_REASON 1670214 non-null object 16 NAME_TYPE_SUITE 849809 non-null object 17 NAME_CLIENT_TYPE 1670214 non-null object 18 NAME_GOODS_CATEGORY 1670214 non-null object 19 NAME_PORTFOLIO 1670214 non-null object 20 NAME_PRODUCT_TYPE 1670214 non-null object 21 CHANNEL_TYPE 1670214 non-null object 22 SELLERPLACE_AREA 1670214 non-null int64 23 NAME_SELLER_INDUSTRY 1670214 non-null object 24 CNT_PAYMENT 1297984 non-null float64 25 NAME_YIELD_GROUP 1670214 non-null object 26 PRODUCT_COMBINATION 1669868 non-null object 27 DAYS_FIRST_DRAWING 997149 non-null float64 28 DAYS_FIRST_DUE 997149 non-null float64 29 DAYS_LAST_DUE_1ST_VERSION 997149 non-null float64 30 DAYS_LAST_DUE 997149 non-null float64 31 DAYS_TERMINATION 997149 non-null float64 32 NFLAG_INSURED_ON_APPROVAL 997149 non-null float64 dtypes: float64(15), int64(4), object(14) memory usage: 420.5+ MB
# Insight:
# After deleting unnecessary columns, there are 22 columns remaining in applicationDF
# Standardize Values
# Strategy for bank_data:
# Convert DAYS_DECISION,DAYS_EMPLOYED, DAYS_REGISTRATION,DAYS_ID_PUBLISH from negative to positive as days cannot be negative.
# Convert DAYS_BIRTH from negative to positive values and calculate age and create categorical bins columns
# Categorize the amount variables into bins
# Convert region rating column and few other columns to categorical
# Converting Negative days to positive days
date_column = ['DAYS_BIRTH','DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH']
for col in date_column:
bank_data[col] = abs(bank_data[col])
# Binning Numerical Columns to create a categorical column
# Creating bins for income amount
bank_data['AMT_INCOME_TOTAL'] = bank_data['AMT_INCOME_TOTAL']/100000
bins = [0,1,2,3,4,5,6,7,8,9,10,11]
slot = ['0-100K','100K-200K', '200k-300k','300k-400k','400k-500k','500k-600k','600k-700k','700k-800k','800k-900k','900k-1M', '1M Above']
bank_data['AMT_INCOME_RANGE'] = pd.cut(bank_data['AMT_INCOME_TOTAL'], bins, labels = slot)
bank_data['AMT_INCOME_RANGE'].value_counts(normalize=True)*100
AMT_INCOME_RANGE 100K-200K 50.735000 200k-300k 21.210691 0-100K 20.729695 300k-400k 4.776116 400k-500k 1.744669 500k-600k 0.356354 600k-700k 0.282805 800k-900k 0.096980 700k-800k 0.052721 900k-1M 0.009112 1M Above 0.005858 Name: proportion, dtype: float64
# Insight:
# More than 50% loan applicants have income amount in the range of 100K-200K. Almost 92% loan applicants have income less than 300K
# Creating bins for Credit amount
bank_data['AMT_CREDIT'] = bank_data['AMT_CREDIT']/100000
bins = [0,1,2,3,4,5,6,7,8,9,10,100]
slots = ['0-100K','100K-200K', '200k-300k','300k-400k','400k-500k','500k-600k','600k-700k','700k-800k',
'800k-900k','900k-1M', '1M Above']
bank_data['AMT_CREDIT_RANGE']=pd.cut(bank_data['AMT_CREDIT'], bins = bins, labels = slots)
#checking the binning of data and % of data in each category
bank_data['AMT_CREDIT_RANGE'].value_counts(normalize=True)*100
AMT_CREDIT_RANGE 200k-300k 17.824728 1M Above 16.254703 500k-600k 11.131960 400k-500k 10.418489 100K-200K 9.801275 300k-400k 8.564897 600k-700k 7.820533 800k-900k 7.086576 700k-800k 6.241403 900k-1M 2.902986 0-100K 1.952450 Name: proportion, dtype: float64
# Insight:
# More Than 16% loan applicants have taken loan which amounts to more than 1M
# Creating bins for Age
bank_data['AGE'] = bank_data['DAYS_BIRTH'] // 365
bins = [0,20,30,40,50,100]
slots = ['0-20','20-30','30-40','40-50','50 above']
bank_data['AGE_GROUP'] = pd.cut( bank_data['AGE'], bins = bins, labels = slots)
# checking the binning of data and % of data in each category
bank_data['AGE_GROUP'].value_counts(normalize=True)*100
AGE_GROUP 50 above 31.604398 30-40 27.028952 40-50 24.194582 20-30 17.171743 0-20 0.000325 Name: proportion, dtype: float64
## Insight:
## 31% loan applicants have age above 50 years. More than 55% of loan applicants have age over 40 years
# Creating bins for Employement Time
bank_data['YEARS_EMPLOYED'] = bank_data['DAYS_EMPLOYED'] // 365
bins = [0,5,10,20,30,40,50,60,150]
slots = ['0-5','5-10','10-20','20-30','30-40','40-50','50-60','60 above']
bank_data['EMPLOYMENT_YEAR']=pd.cut(bank_data['YEARS_EMPLOYED'],bins=bins,labels=slots)
#checking the binning of data and % of data in each category
bank_data['EMPLOYMENT_YEAR'].value_counts(normalize=True)*100
EMPLOYMENT_YEAR 0-5 55.582363 5-10 24.966441 10-20 14.564315 20-30 3.750117 30-40 1.058720 40-50 0.078044 50-60 0.000000 60 above 0.000000 Name: proportion, dtype: float64
# Insight:
# More than 55% of the loan applicants have work experience within 0-5 years and almost 80% of them have less than 10 years of work experience
#Checking the number of unique values each column possess to identify categorical columns
bank_data.nunique()
SK_ID_CURR 307511 TARGET 2 NAME_CONTRACT_TYPE 2 CODE_GENDER 3 FLAG_OWN_CAR 2 FLAG_OWN_REALTY 2 CNT_CHILDREN 15 AMT_INCOME_TOTAL 2548 AMT_CREDIT 5603 AMT_ANNUITY 13672 AMT_GOODS_PRICE 1002 NAME_TYPE_SUITE 7 NAME_INCOME_TYPE 8 NAME_EDUCATION_TYPE 5 NAME_FAMILY_STATUS 6 NAME_HOUSING_TYPE 6 REGION_POPULATION_RELATIVE 81 DAYS_BIRTH 17460 DAYS_EMPLOYED 12574 DAYS_REGISTRATION 15688 DAYS_ID_PUBLISH 6168 OCCUPATION_TYPE 18 CNT_FAM_MEMBERS 17 REGION_RATING_CLIENT 3 REGION_RATING_CLIENT_W_CITY 3 WEEKDAY_APPR_PROCESS_START 7 HOUR_APPR_PROCESS_START 24 REG_REGION_NOT_LIVE_REGION 2 REG_REGION_NOT_WORK_REGION 2 LIVE_REGION_NOT_WORK_REGION 2 REG_CITY_NOT_LIVE_CITY 2 REG_CITY_NOT_WORK_CITY 2 LIVE_CITY_NOT_WORK_CITY 2 ORGANIZATION_TYPE 58 OBS_30_CNT_SOCIAL_CIRCLE 33 DEF_30_CNT_SOCIAL_CIRCLE 10 OBS_60_CNT_SOCIAL_CIRCLE 33 DEF_60_CNT_SOCIAL_CIRCLE 9 DAYS_LAST_PHONE_CHANGE 3773 FLAG_DOCUMENT_3 2 AMT_REQ_CREDIT_BUREAU_HOUR 5 AMT_REQ_CREDIT_BUREAU_DAY 9 AMT_REQ_CREDIT_BUREAU_WEEK 9 AMT_REQ_CREDIT_BUREAU_MON 24 AMT_REQ_CREDIT_BUREAU_QRT 11 AMT_REQ_CREDIT_BUREAU_YEAR 25 AMT_INCOME_RANGE 11 AMT_CREDIT_RANGE 11 AGE 50 AGE_GROUP 5 YEARS_EMPLOYED 51 EMPLOYMENT_YEAR 6 dtype: int64
bank_data.nunique().sort_values()
LIVE_CITY_NOT_WORK_CITY 2 TARGET 2 NAME_CONTRACT_TYPE 2 REG_REGION_NOT_LIVE_REGION 2 FLAG_OWN_CAR 2 FLAG_OWN_REALTY 2 REG_REGION_NOT_WORK_REGION 2 LIVE_REGION_NOT_WORK_REGION 2 FLAG_DOCUMENT_3 2 REG_CITY_NOT_LIVE_CITY 2 REG_CITY_NOT_WORK_CITY 2 REGION_RATING_CLIENT 3 CODE_GENDER 3 REGION_RATING_CLIENT_W_CITY 3 AMT_REQ_CREDIT_BUREAU_HOUR 5 NAME_EDUCATION_TYPE 5 AGE_GROUP 5 NAME_FAMILY_STATUS 6 NAME_HOUSING_TYPE 6 EMPLOYMENT_YEAR 6 WEEKDAY_APPR_PROCESS_START 7 NAME_TYPE_SUITE 7 NAME_INCOME_TYPE 8 AMT_REQ_CREDIT_BUREAU_WEEK 9 AMT_REQ_CREDIT_BUREAU_DAY 9 DEF_60_CNT_SOCIAL_CIRCLE 9 DEF_30_CNT_SOCIAL_CIRCLE 10 AMT_CREDIT_RANGE 11 AMT_INCOME_RANGE 11 AMT_REQ_CREDIT_BUREAU_QRT 11 CNT_CHILDREN 15 CNT_FAM_MEMBERS 17 OCCUPATION_TYPE 18 HOUR_APPR_PROCESS_START 24 AMT_REQ_CREDIT_BUREAU_MON 24 AMT_REQ_CREDIT_BUREAU_YEAR 25 OBS_60_CNT_SOCIAL_CIRCLE 33 OBS_30_CNT_SOCIAL_CIRCLE 33 AGE 50 YEARS_EMPLOYED 51 ORGANIZATION_TYPE 58 REGION_POPULATION_RELATIVE 81 AMT_GOODS_PRICE 1002 AMT_INCOME_TOTAL 2548 DAYS_LAST_PHONE_CHANGE 3773 AMT_CREDIT 5603 DAYS_ID_PUBLISH 6168 DAYS_EMPLOYED 12574 AMT_ANNUITY 13672 DAYS_REGISTRATION 15688 DAYS_BIRTH 17460 SK_ID_CURR 307511 dtype: int64
# inspecting the column types if they are in correct data type using the above result.
bank_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 52 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307511 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307499 non-null float64 10 AMT_GOODS_PRICE 307233 non-null float64 11 NAME_TYPE_SUITE 306219 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null float64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 OCCUPATION_TYPE 211120 non-null object 22 CNT_FAM_MEMBERS 307509 non-null float64 23 REGION_RATING_CLIENT 307511 non-null int64 24 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 25 WEEKDAY_APPR_PROCESS_START 307511 non-null object 26 HOUR_APPR_PROCESS_START 307511 non-null int64 27 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 28 REG_REGION_NOT_WORK_REGION 307511 non-null int64 29 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 30 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 31 REG_CITY_NOT_WORK_CITY 307511 non-null int64 32 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 33 ORGANIZATION_TYPE 307511 non-null object 34 OBS_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 35 DEF_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 36 OBS_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 37 DEF_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 38 DAYS_LAST_PHONE_CHANGE 307510 non-null float64 39 FLAG_DOCUMENT_3 307511 non-null int64 40 AMT_REQ_CREDIT_BUREAU_HOUR 265992 non-null float64 41 AMT_REQ_CREDIT_BUREAU_DAY 265992 non-null float64 42 AMT_REQ_CREDIT_BUREAU_WEEK 265992 non-null float64 43 AMT_REQ_CREDIT_BUREAU_MON 265992 non-null float64 44 AMT_REQ_CREDIT_BUREAU_QRT 265992 non-null float64 45 AMT_REQ_CREDIT_BUREAU_YEAR 265992 non-null float64 46 AMT_INCOME_RANGE 307279 non-null category 47 AMT_CREDIT_RANGE 307511 non-null category 48 AGE 307511 non-null int64 49 AGE_GROUP 307511 non-null category 50 YEARS_EMPLOYED 307511 non-null int64 51 EMPLOYMENT_YEAR 224233 non-null category dtypes: category(4), float64(18), int64(18), object(12) memory usage: 113.8+ MB
# Insight:
# Numeric columns are already in int64 and float64 format. Hence proceeding with other columns.
# Conversion of Object and Numerical columns to Categorical Columns
categorical_columns = ['NAME_CONTRACT_TYPE','CODE_GENDER','NAME_TYPE_SUITE','NAME_INCOME_TYPE','NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START',
'ORGANIZATION_TYPE','FLAG_OWN_CAR','FLAG_OWN_REALTY','LIVE_CITY_NOT_WORK_CITY',
'REG_CITY_NOT_LIVE_CITY','REG_CITY_NOT_WORK_CITY','REG_REGION_NOT_WORK_REGION',
'LIVE_REGION_NOT_WORK_REGION','REGION_RATING_CLIENT','WEEKDAY_APPR_PROCESS_START',
'REGION_RATING_CLIENT_W_CITY'
]
for col in categorical_columns:
bank_data[col] = pd.Categorical(bank_data[col])
bank_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 52 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null category 3 CODE_GENDER 307511 non-null category 4 FLAG_OWN_CAR 307511 non-null category 5 FLAG_OWN_REALTY 307511 non-null category 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307499 non-null float64 10 AMT_GOODS_PRICE 307233 non-null float64 11 NAME_TYPE_SUITE 306219 non-null category 12 NAME_INCOME_TYPE 307511 non-null category 13 NAME_EDUCATION_TYPE 307511 non-null category 14 NAME_FAMILY_STATUS 307511 non-null category 15 NAME_HOUSING_TYPE 307511 non-null category 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null float64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 OCCUPATION_TYPE 211120 non-null category 22 CNT_FAM_MEMBERS 307509 non-null float64 23 REGION_RATING_CLIENT 307511 non-null category 24 REGION_RATING_CLIENT_W_CITY 307511 non-null category 25 WEEKDAY_APPR_PROCESS_START 307511 non-null category 26 HOUR_APPR_PROCESS_START 307511 non-null int64 27 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 28 REG_REGION_NOT_WORK_REGION 307511 non-null category 29 LIVE_REGION_NOT_WORK_REGION 307511 non-null category 30 REG_CITY_NOT_LIVE_CITY 307511 non-null category 31 REG_CITY_NOT_WORK_CITY 307511 non-null category 32 LIVE_CITY_NOT_WORK_CITY 307511 non-null category 33 ORGANIZATION_TYPE 307511 non-null category 34 OBS_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 35 DEF_30_CNT_SOCIAL_CIRCLE 306490 non-null float64 36 OBS_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 37 DEF_60_CNT_SOCIAL_CIRCLE 306490 non-null float64 38 DAYS_LAST_PHONE_CHANGE 307510 non-null float64 39 FLAG_DOCUMENT_3 307511 non-null int64 40 AMT_REQ_CREDIT_BUREAU_HOUR 265992 non-null float64 41 AMT_REQ_CREDIT_BUREAU_DAY 265992 non-null float64 42 AMT_REQ_CREDIT_BUREAU_WEEK 265992 non-null float64 43 AMT_REQ_CREDIT_BUREAU_MON 265992 non-null float64 44 AMT_REQ_CREDIT_BUREAU_QRT 265992 non-null float64 45 AMT_REQ_CREDIT_BUREAU_YEAR 265992 non-null float64 46 AMT_INCOME_RANGE 307279 non-null category 47 AMT_CREDIT_RANGE 307511 non-null category 48 AGE 307511 non-null int64 49 AGE_GROUP 307511 non-null category 50 YEARS_EMPLOYED 307511 non-null int64 51 EMPLOYMENT_YEAR 224233 non-null category dtypes: category(23), float64(18), int64(11) memory usage: 74.8 MB
## Standardize Values for previous_data
# Strategy for previousDF:
# Convert DAYS_DECISION from negative to positive values and create categorical bins columns.
# Convert loan purpose and few other columns to categorical.
# Checking the number of unique values each column possess to identify categorical columns
previous_data.nunique()
SK_ID_PREV 1670214 SK_ID_CURR 338857 NAME_CONTRACT_TYPE 4 AMT_ANNUITY 357959 AMT_APPLICATION 93885 AMT_CREDIT 86803 AMT_DOWN_PAYMENT 29278 AMT_GOODS_PRICE 93885 RATE_DOWN_PAYMENT 207033 RATE_INTEREST_PRIMARY 148 RATE_INTEREST_PRIVILEGED 25 NAME_CASH_LOAN_PURPOSE 25 NAME_CONTRACT_STATUS 4 DAYS_DECISION 2922 NAME_PAYMENT_TYPE 4 CODE_REJECT_REASON 9 NAME_TYPE_SUITE 7 NAME_CLIENT_TYPE 4 NAME_GOODS_CATEGORY 28 NAME_PORTFOLIO 5 NAME_PRODUCT_TYPE 3 CHANNEL_TYPE 8 SELLERPLACE_AREA 2097 NAME_SELLER_INDUSTRY 11 CNT_PAYMENT 49 NAME_YIELD_GROUP 5 PRODUCT_COMBINATION 17 DAYS_FIRST_DRAWING 2838 DAYS_FIRST_DUE 2892 DAYS_LAST_DUE_1ST_VERSION 4605 DAYS_LAST_DUE 2873 DAYS_TERMINATION 2830 NFLAG_INSURED_ON_APPROVAL 2 dtype: int64
previous_data.nunique().sort_values()
NFLAG_INSURED_ON_APPROVAL 2 NAME_PRODUCT_TYPE 3 NAME_CONTRACT_TYPE 4 NAME_CLIENT_TYPE 4 NAME_PAYMENT_TYPE 4 NAME_CONTRACT_STATUS 4 NAME_YIELD_GROUP 5 NAME_PORTFOLIO 5 NAME_TYPE_SUITE 7 CHANNEL_TYPE 8 CODE_REJECT_REASON 9 NAME_SELLER_INDUSTRY 11 PRODUCT_COMBINATION 17 RATE_INTEREST_PRIVILEGED 25 NAME_CASH_LOAN_PURPOSE 25 NAME_GOODS_CATEGORY 28 CNT_PAYMENT 49 RATE_INTEREST_PRIMARY 148 SELLERPLACE_AREA 2097 DAYS_TERMINATION 2830 DAYS_FIRST_DRAWING 2838 DAYS_LAST_DUE 2873 DAYS_FIRST_DUE 2892 DAYS_DECISION 2922 DAYS_LAST_DUE_1ST_VERSION 4605 AMT_DOWN_PAYMENT 29278 AMT_CREDIT 86803 AMT_GOODS_PRICE 93885 AMT_APPLICATION 93885 RATE_DOWN_PAYMENT 207033 SK_ID_CURR 338857 AMT_ANNUITY 357959 SK_ID_PREV 1670214 dtype: int64
# inspecting the column types if the above conversion is reflected
previous_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1670214 entries, 0 to 1670213 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_PREV 1670214 non-null int64 1 SK_ID_CURR 1670214 non-null int64 2 NAME_CONTRACT_TYPE 1670214 non-null object 3 AMT_ANNUITY 1297979 non-null float64 4 AMT_APPLICATION 1670214 non-null float64 5 AMT_CREDIT 1670213 non-null float64 6 AMT_DOWN_PAYMENT 774370 non-null float64 7 AMT_GOODS_PRICE 1284699 non-null float64 8 RATE_DOWN_PAYMENT 774370 non-null float64 9 RATE_INTEREST_PRIMARY 5951 non-null float64 10 RATE_INTEREST_PRIVILEGED 5951 non-null float64 11 NAME_CASH_LOAN_PURPOSE 1670214 non-null object 12 NAME_CONTRACT_STATUS 1670214 non-null object 13 DAYS_DECISION 1670214 non-null int64 14 NAME_PAYMENT_TYPE 1670214 non-null object 15 CODE_REJECT_REASON 1670214 non-null object 16 NAME_TYPE_SUITE 849809 non-null object 17 NAME_CLIENT_TYPE 1670214 non-null object 18 NAME_GOODS_CATEGORY 1670214 non-null object 19 NAME_PORTFOLIO 1670214 non-null object 20 NAME_PRODUCT_TYPE 1670214 non-null object 21 CHANNEL_TYPE 1670214 non-null object 22 SELLERPLACE_AREA 1670214 non-null int64 23 NAME_SELLER_INDUSTRY 1670214 non-null object 24 CNT_PAYMENT 1297984 non-null float64 25 NAME_YIELD_GROUP 1670214 non-null object 26 PRODUCT_COMBINATION 1669868 non-null object 27 DAYS_FIRST_DRAWING 997149 non-null float64 28 DAYS_FIRST_DUE 997149 non-null float64 29 DAYS_LAST_DUE_1ST_VERSION 997149 non-null float64 30 DAYS_LAST_DUE 997149 non-null float64 31 DAYS_TERMINATION 997149 non-null float64 32 NFLAG_INSURED_ON_APPROVAL 997149 non-null float64 dtypes: float64(15), int64(4), object(14) memory usage: 420.5+ MB
#Converting negative days to positive days
previous_data['DAYS_DECISION'] = abs(previous_data['DAYS_DECISION'])
previous_data['DAYS_DECISION'] = abs(previous_data['DAYS_DECISION'])
previous_data['DAYS_DECISION']
0 73
1 164
2 301
3 512
4 781
...
1670209 544
1670210 1694
1670211 1488
1670212 1185
1670213 1193
Name: DAYS_DECISION, Length: 1670214, dtype: int64
# age group calculation e.g. 388 will be grouped as 300-400
previous_data['DAYS_DECISION_GROUP'] = (previous_data['DAYS_DECISION']-(previous_data['DAYS_DECISION'] % 400)).astype(str)+'-'+ ((previous_data['DAYS_DECISION'] - (previous_data['DAYS_DECISION'] % 400)) + (previous_data['DAYS_DECISION'] % 400) + (400 - (previous_data['DAYS_DECISION'] % 400))).astype(str)
previous_data['DAYS_DECISION_GROUP']
0 0-400
1 0-400
2 0-400
3 400-800
4 400-800
...
1670209 400-800
1670210 1600-2000
1670211 1200-1600
1670212 800-1200
1670213 800-1200
Name: DAYS_DECISION_GROUP, Length: 1670214, dtype: object
previous_data['DAYS_DECISION_GROUP'].value_counts(normalize=True)*100
DAYS_DECISION_GROUP 0-400 37.490525 400-800 22.944724 800-1200 12.444753 1200-1600 7.904556 2400-2800 6.297456 1600-2000 5.795784 2000-2400 5.684960 2800-3200 1.437241 Name: proportion, dtype: float64
# Insight:
# Almost 37% loan applicatants have applied for a new loan within 0-400 days of previous loan decision
# Converting Categorical columns from Object to categorical
Catgorical_col_p = ['NAME_CASH_LOAN_PURPOSE','NAME_CONTRACT_STATUS','NAME_PAYMENT_TYPE',
'CODE_REJECT_REASON','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO',
'NAME_PRODUCT_TYPE','CHANNEL_TYPE','NAME_SELLER_INDUSTRY','NAME_YIELD_GROUP','PRODUCT_COMBINATION',
'NAME_CONTRACT_TYPE','DAYS_DECISION_GROUP']
for col in Catgorical_col_p:
previous_data[col] =pd.Categorical(previous_data[col])
# inspecting the column types after conversion
previous_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1670214 entries, 0 to 1670213 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_PREV 1670214 non-null int64 1 SK_ID_CURR 1670214 non-null int64 2 NAME_CONTRACT_TYPE 1670214 non-null category 3 AMT_ANNUITY 1297979 non-null float64 4 AMT_APPLICATION 1670214 non-null float64 5 AMT_CREDIT 1670213 non-null float64 6 AMT_DOWN_PAYMENT 774370 non-null float64 7 AMT_GOODS_PRICE 1284699 non-null float64 8 RATE_DOWN_PAYMENT 774370 non-null float64 9 RATE_INTEREST_PRIMARY 5951 non-null float64 10 RATE_INTEREST_PRIVILEGED 5951 non-null float64 11 NAME_CASH_LOAN_PURPOSE 1670214 non-null category 12 NAME_CONTRACT_STATUS 1670214 non-null category 13 DAYS_DECISION 1670214 non-null int64 14 NAME_PAYMENT_TYPE 1670214 non-null category 15 CODE_REJECT_REASON 1670214 non-null category 16 NAME_TYPE_SUITE 849809 non-null object 17 NAME_CLIENT_TYPE 1670214 non-null category 18 NAME_GOODS_CATEGORY 1670214 non-null category 19 NAME_PORTFOLIO 1670214 non-null category 20 NAME_PRODUCT_TYPE 1670214 non-null category 21 CHANNEL_TYPE 1670214 non-null category 22 SELLERPLACE_AREA 1670214 non-null int64 23 NAME_SELLER_INDUSTRY 1670214 non-null category 24 CNT_PAYMENT 1297984 non-null float64 25 NAME_YIELD_GROUP 1670214 non-null category 26 PRODUCT_COMBINATION 1669868 non-null category 27 DAYS_FIRST_DRAWING 997149 non-null float64 28 DAYS_FIRST_DUE 997149 non-null float64 29 DAYS_LAST_DUE_1ST_VERSION 997149 non-null float64 30 DAYS_LAST_DUE 997149 non-null float64 31 DAYS_TERMINATION 997149 non-null float64 32 NFLAG_INSURED_ON_APPROVAL 997149 non-null float64 33 DAYS_DECISION_GROUP 1670214 non-null category dtypes: category(14), float64(15), int64(4), object(1) memory usage: 277.2+ MB
# 4.6 Null Value Data Imputation
# 4.6.1 Imputing Null Values in applicationDF
# Strategy for applicationDF:
# To impute null values in categorical variables which has lower null percentage, mode() is used to impute the most frequent items.
# To impute null values in categorical variables which has higher null percentage, a new category is created.
# To impute null values in numerical variables which has lower null percentage, median() is used as
# There are no outliers in the columns
# Mean returned decimal values and median returned whole numbers and the columns were number of requests
# checking the null value % of each column in applicationDF dataframe
round(bank_data.isnull().sum() / bank_data.shape[0] * 100.00,2)
SK_ID_CURR 0.00 TARGET 0.00 NAME_CONTRACT_TYPE 0.00 CODE_GENDER 0.00 FLAG_OWN_CAR 0.00 FLAG_OWN_REALTY 0.00 CNT_CHILDREN 0.00 AMT_INCOME_TOTAL 0.00 AMT_CREDIT 0.00 AMT_ANNUITY 0.00 AMT_GOODS_PRICE 0.09 NAME_TYPE_SUITE 0.42 NAME_INCOME_TYPE 0.00 NAME_EDUCATION_TYPE 0.00 NAME_FAMILY_STATUS 0.00 NAME_HOUSING_TYPE 0.00 REGION_POPULATION_RELATIVE 0.00 DAYS_BIRTH 0.00 DAYS_EMPLOYED 0.00 DAYS_REGISTRATION 0.00 DAYS_ID_PUBLISH 0.00 OCCUPATION_TYPE 31.35 CNT_FAM_MEMBERS 0.00 REGION_RATING_CLIENT 0.00 REGION_RATING_CLIENT_W_CITY 0.00 WEEKDAY_APPR_PROCESS_START 0.00 HOUR_APPR_PROCESS_START 0.00 REG_REGION_NOT_LIVE_REGION 0.00 REG_REGION_NOT_WORK_REGION 0.00 LIVE_REGION_NOT_WORK_REGION 0.00 REG_CITY_NOT_LIVE_CITY 0.00 REG_CITY_NOT_WORK_CITY 0.00 LIVE_CITY_NOT_WORK_CITY 0.00 ORGANIZATION_TYPE 0.00 OBS_30_CNT_SOCIAL_CIRCLE 0.33 DEF_30_CNT_SOCIAL_CIRCLE 0.33 OBS_60_CNT_SOCIAL_CIRCLE 0.33 DEF_60_CNT_SOCIAL_CIRCLE 0.33 DAYS_LAST_PHONE_CHANGE 0.00 FLAG_DOCUMENT_3 0.00 AMT_REQ_CREDIT_BUREAU_HOUR 13.50 AMT_REQ_CREDIT_BUREAU_DAY 13.50 AMT_REQ_CREDIT_BUREAU_WEEK 13.50 AMT_REQ_CREDIT_BUREAU_MON 13.50 AMT_REQ_CREDIT_BUREAU_QRT 13.50 AMT_REQ_CREDIT_BUREAU_YEAR 13.50 AMT_INCOME_RANGE 0.08 AMT_CREDIT_RANGE 0.00 AGE 0.00 AGE_GROUP 0.00 YEARS_EMPLOYED 0.00 EMPLOYMENT_YEAR 27.08 dtype: float64
Impute categorical variable 'NAME_TYPE_SUITE' which has lower null percentage(0.42%) with the most frequent category using mode()[ 0 ] :
bank_data['NAME_TYPE_SUITE'].describe()
count 306219 unique 7 top Unaccompanied freq 248526 Name: NAME_TYPE_SUITE, dtype: object
bank_data['NAME_TYPE_SUITE'].fillna((bank_data['NAME_TYPE_SUITE'].mode()[0]),inplace = True)
# Impute categorical variable 'OCCUPATION_TYPE' which has higher null percentage(31.35%) with a new category as assigning to any existing category might influence the analysis:
bank_data['OCCUPATION_TYPE'] = bank_data['OCCUPATION_TYPE'].cat.add_categories('Unknown')
bank_data['OCCUPATION_TYPE'].fillna('Unknown', inplace =True)
# Impute numerical variables with the median as there are no outliers that can be seen from results of describe() and mean() returns decimal values and these columns represent number of enquiries made which cannot be decimal:
bank_data[['AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY',
'AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON',
'AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR']].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| AMT_REQ_CREDIT_BUREAU_HOUR | 265992.0 | 0.006402 | 0.083849 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 |
| AMT_REQ_CREDIT_BUREAU_DAY | 265992.0 | 0.007000 | 0.110757 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 |
| AMT_REQ_CREDIT_BUREAU_WEEK | 265992.0 | 0.034362 | 0.204685 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 |
| AMT_REQ_CREDIT_BUREAU_MON | 265992.0 | 0.267395 | 0.916002 | 0.0 | 0.0 | 0.0 | 0.0 | 27.0 |
| AMT_REQ_CREDIT_BUREAU_QRT | 265992.0 | 0.265474 | 0.794056 | 0.0 | 0.0 | 0.0 | 0.0 | 261.0 |
| AMT_REQ_CREDIT_BUREAU_YEAR | 265992.0 | 1.899974 | 1.869295 | 0.0 | 0.0 | 1.0 | 3.0 | 25.0 |
### Impute with median as mean has decimals and this is number of requests.
amount = ['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON',
'AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR']
for col in amount:
bank_data[col].fillna(bank_data[col].median(), inplace = True)
# checking the null value % of each column in previous_data dataframe
round(bank_data.isnull().sum() / previous_data.shape[0] * 100.00,2)
SK_ID_CURR 0.00 TARGET 0.00 NAME_CONTRACT_TYPE 0.00 CODE_GENDER 0.00 FLAG_OWN_CAR 0.00 FLAG_OWN_REALTY 0.00 CNT_CHILDREN 0.00 AMT_INCOME_TOTAL 0.00 AMT_CREDIT 0.00 AMT_ANNUITY 0.00 AMT_GOODS_PRICE 0.02 NAME_TYPE_SUITE 0.00 NAME_INCOME_TYPE 0.00 NAME_EDUCATION_TYPE 0.00 NAME_FAMILY_STATUS 0.00 NAME_HOUSING_TYPE 0.00 REGION_POPULATION_RELATIVE 0.00 DAYS_BIRTH 0.00 DAYS_EMPLOYED 0.00 DAYS_REGISTRATION 0.00 DAYS_ID_PUBLISH 0.00 OCCUPATION_TYPE 0.00 CNT_FAM_MEMBERS 0.00 REGION_RATING_CLIENT 0.00 REGION_RATING_CLIENT_W_CITY 0.00 WEEKDAY_APPR_PROCESS_START 0.00 HOUR_APPR_PROCESS_START 0.00 REG_REGION_NOT_LIVE_REGION 0.00 REG_REGION_NOT_WORK_REGION 0.00 LIVE_REGION_NOT_WORK_REGION 0.00 REG_CITY_NOT_LIVE_CITY 0.00 REG_CITY_NOT_WORK_CITY 0.00 LIVE_CITY_NOT_WORK_CITY 0.00 ORGANIZATION_TYPE 0.00 OBS_30_CNT_SOCIAL_CIRCLE 0.06 DEF_30_CNT_SOCIAL_CIRCLE 0.06 OBS_60_CNT_SOCIAL_CIRCLE 0.06 DEF_60_CNT_SOCIAL_CIRCLE 0.06 DAYS_LAST_PHONE_CHANGE 0.00 FLAG_DOCUMENT_3 0.00 AMT_REQ_CREDIT_BUREAU_HOUR 0.00 AMT_REQ_CREDIT_BUREAU_DAY 0.00 AMT_REQ_CREDIT_BUREAU_WEEK 0.00 AMT_REQ_CREDIT_BUREAU_MON 0.00 AMT_REQ_CREDIT_BUREAU_QRT 0.00 AMT_REQ_CREDIT_BUREAU_YEAR 0.00 AMT_INCOME_RANGE 0.01 AMT_CREDIT_RANGE 0.00 AGE 0.00 AGE_GROUP 0.00 YEARS_EMPLOYED 0.00 EMPLOYMENT_YEAR 4.99 dtype: float64
round(bank_data.isnull().sum() / bank_data.shape[0] * 100.00,2)
SK_ID_CURR 0.00 TARGET 0.00 NAME_CONTRACT_TYPE 0.00 CODE_GENDER 0.00 FLAG_OWN_CAR 0.00 FLAG_OWN_REALTY 0.00 CNT_CHILDREN 0.00 AMT_INCOME_TOTAL 0.00 AMT_CREDIT 0.00 AMT_ANNUITY 0.00 AMT_GOODS_PRICE 0.09 NAME_TYPE_SUITE 0.00 NAME_INCOME_TYPE 0.00 NAME_EDUCATION_TYPE 0.00 NAME_FAMILY_STATUS 0.00 NAME_HOUSING_TYPE 0.00 REGION_POPULATION_RELATIVE 0.00 DAYS_BIRTH 0.00 DAYS_EMPLOYED 0.00 DAYS_REGISTRATION 0.00 DAYS_ID_PUBLISH 0.00 OCCUPATION_TYPE 0.00 CNT_FAM_MEMBERS 0.00 REGION_RATING_CLIENT 0.00 REGION_RATING_CLIENT_W_CITY 0.00 WEEKDAY_APPR_PROCESS_START 0.00 HOUR_APPR_PROCESS_START 0.00 REG_REGION_NOT_LIVE_REGION 0.00 REG_REGION_NOT_WORK_REGION 0.00 LIVE_REGION_NOT_WORK_REGION 0.00 REG_CITY_NOT_LIVE_CITY 0.00 REG_CITY_NOT_WORK_CITY 0.00 LIVE_CITY_NOT_WORK_CITY 0.00 ORGANIZATION_TYPE 0.00 OBS_30_CNT_SOCIAL_CIRCLE 0.33 DEF_30_CNT_SOCIAL_CIRCLE 0.33 OBS_60_CNT_SOCIAL_CIRCLE 0.33 DEF_60_CNT_SOCIAL_CIRCLE 0.33 DAYS_LAST_PHONE_CHANGE 0.00 FLAG_DOCUMENT_3 0.00 AMT_REQ_CREDIT_BUREAU_HOUR 0.00 AMT_REQ_CREDIT_BUREAU_DAY 0.00 AMT_REQ_CREDIT_BUREAU_WEEK 0.00 AMT_REQ_CREDIT_BUREAU_MON 0.00 AMT_REQ_CREDIT_BUREAU_QRT 0.00 AMT_REQ_CREDIT_BUREAU_YEAR 0.00 AMT_INCOME_RANGE 0.08 AMT_CREDIT_RANGE 0.00 AGE 0.00 AGE_GROUP 0.00 YEARS_EMPLOYED 0.00 EMPLOYMENT_YEAR 27.08 dtype: float64
# Insight:
# We still have few null values in the columns:
# AMT_GOODS_PRICE, OBS_30_CNT_SOCIAL_CIRCLE, DEF_30_CNT_SOCIAL_CIRCLE, OBS_60_CNT_SOCIAL_CIRCLE, DEF_60_CNT_SOCIAL_CIRCLE.
# We can ignore as this percentage is very less.
## 4.6.2 Imputing Null Values in previous_data
# Strategy for bank_data:
# To impute null values in numerical column, we analysed the loan status and assigned values.
# To impute null values in continuous variables, we plotted the distribution of the columns and used
# median if the distribution is skewed
# mode if the distribution pattern is preserved.
# checking the null value % of each column in previousDF dataframe
round(previous_data.isnull().sum() / previous_data.shape[0] * 100.00,2)
SK_ID_PREV 0.00 SK_ID_CURR 0.00 NAME_CONTRACT_TYPE 0.00 AMT_ANNUITY 22.29 AMT_APPLICATION 0.00 AMT_CREDIT 0.00 AMT_DOWN_PAYMENT 53.64 AMT_GOODS_PRICE 23.08 RATE_DOWN_PAYMENT 53.64 RATE_INTEREST_PRIMARY 99.64 RATE_INTEREST_PRIVILEGED 99.64 NAME_CASH_LOAN_PURPOSE 0.00 NAME_CONTRACT_STATUS 0.00 DAYS_DECISION 0.00 NAME_PAYMENT_TYPE 0.00 CODE_REJECT_REASON 0.00 NAME_TYPE_SUITE 49.12 NAME_CLIENT_TYPE 0.00 NAME_GOODS_CATEGORY 0.00 NAME_PORTFOLIO 0.00 NAME_PRODUCT_TYPE 0.00 CHANNEL_TYPE 0.00 SELLERPLACE_AREA 0.00 NAME_SELLER_INDUSTRY 0.00 CNT_PAYMENT 22.29 NAME_YIELD_GROUP 0.00 PRODUCT_COMBINATION 0.02 DAYS_FIRST_DRAWING 40.30 DAYS_FIRST_DUE 40.30 DAYS_LAST_DUE_1ST_VERSION 40.30 DAYS_LAST_DUE 40.30 DAYS_TERMINATION 40.30 NFLAG_INSURED_ON_APPROVAL 40.30 DAYS_DECISION_GROUP 0.00 dtype: float64
###### Impute AMT_ANNUITY with median as the distribution is greatly skewed:
plt.figure(figsize=(6,6))
sns.kdeplot(previous_data['AMT_ANNUITY'])
plt.show()
# Insight:
# There is a single peak at the left side of the distribution and it indicates the presence of outliers and hence imputing with mean would not be the right approach and hence imputing with median
previous_data['AMT_ANNUITY'].fillna(previous_data['AMT_ANNUITY'].mean(), inplace = True)
## AMT_GOODS_PRICE with mode as the distribution is closely similar:
plt.figure(figsize =(4,4))
sns.kdeplot(previous_data['AMT_GOODS_PRICE'] ,shade = True, color = "red")
<Axes: xlabel='AMT_GOODS_PRICE', ylabel='Density'>
stat = pd.DataFrame()
stat['AMT_GOODS_PRICE_mode'] = previous_data['AMT_GOODS_PRICE'].fillna(previous_data['AMT_GOODS_PRICE'].mode()[0])
stat['AMT_GOODS_PRICE_median'] = previous_data['AMT_GOODS_PRICE'].fillna(previous_data['AMT_GOODS_PRICE'].median())
stat['AMT_GOODS_PRICE_mean'] = previous_data['AMT_GOODS_PRICE'].fillna(previous_data['AMT_GOODS_PRICE'].mean())
columns = ['AMT_GOODS_PRICE_mode', 'AMT_GOODS_PRICE_median','AMT_GOODS_PRICE_mean']
plt.figure(figsize=(18,10))
plt.suptitle('Distribution of Original data vs imputed data')
plt.subplot(221)
sns.distplot(previous_data['AMT_GOODS_PRICE'][pd.notnull(previous_data['AMT_GOODS_PRICE'])]);
for i in enumerate(columns):
plt.subplot(2,2,i[0]+2)
sns.distplot(stat[i[1]])
# Insight:
# The original distribution is closer with the distribution of data imputed with mode in this case
previous_data['AMT_GOODS_PRICE'].fillna(previous_data['AMT_GOODS_PRICE'].mode()[0], inplace = True)
## Impute CNT_PAYMENT with 0 as the NAME_CONTRACT_STATUS for these indicate that most of these loans were not started:
previous_data.loc[previous_data['CNT_PAYMENT'].isnull(),'NAME_CONTRACT_STATUS'].value_counts()
NAME_CONTRACT_STATUS Canceled 305805 Refused 40897 Unused offer 25524 Approved 4 Name: count, dtype: int64
previous_data['CNT_PAYMENT'].fillna(0,inplace = True)
round(previous_data.isnull().sum() / previous_data.shape[0] * 100.00,2)
SK_ID_PREV 0.00 SK_ID_CURR 0.00 NAME_CONTRACT_TYPE 0.00 AMT_ANNUITY 0.00 AMT_APPLICATION 0.00 AMT_CREDIT 0.00 AMT_DOWN_PAYMENT 53.64 AMT_GOODS_PRICE 0.00 RATE_DOWN_PAYMENT 53.64 RATE_INTEREST_PRIMARY 99.64 RATE_INTEREST_PRIVILEGED 99.64 NAME_CASH_LOAN_PURPOSE 0.00 NAME_CONTRACT_STATUS 0.00 DAYS_DECISION 0.00 NAME_PAYMENT_TYPE 0.00 CODE_REJECT_REASON 0.00 NAME_TYPE_SUITE 49.12 NAME_CLIENT_TYPE 0.00 NAME_GOODS_CATEGORY 0.00 NAME_PORTFOLIO 0.00 NAME_PRODUCT_TYPE 0.00 CHANNEL_TYPE 0.00 SELLERPLACE_AREA 0.00 NAME_SELLER_INDUSTRY 0.00 CNT_PAYMENT 0.00 NAME_YIELD_GROUP 0.00 PRODUCT_COMBINATION 0.02 DAYS_FIRST_DRAWING 40.30 DAYS_FIRST_DUE 40.30 DAYS_LAST_DUE_1ST_VERSION 40.30 DAYS_LAST_DUE 40.30 DAYS_TERMINATION 40.30 NFLAG_INSURED_ON_APPROVAL 40.30 DAYS_DECISION_GROUP 0.00 dtype: float64
### 4.7 Identifying the outliers
## Finding outlier information in applicationDF
plt.figure(figsize=(22,10))
application_outlier_col_1 = ['AMT_ANNUITY','AMT_INCOME_TOTAL','AMT_CREDIT','AMT_GOODS_PRICE','DAYS_EMPLOYED']
application_outlier_col_2 = ['CNT_CHILDREN','DAYS_BIRTH']
for i in enumerate(application_outlier_col_1):
plt.subplot(2,4,i[0]+1)
sns.boxplot(y = bank_data[i[1]])
for i in enumerate(application_outlier_col_2):
plt.subplot(2,4,i[0]+6)
sns.boxplot(y =bank_data[i[1]])
# Insight:
# It can be seen that in current application data
# AMT_ANNUITY, AMT_CREDIT, AMT_GOODS_PRICE,CNT_CHILDREN have some number of outliers.
# AMT_INCOME_TOTAL has huge number of outliers which indicate that few of the loan applicants have high income when compared to the others.
# DAYS_BIRTH has no outliers which means the data available is reliable.
# DAYS_EMPLOYED has outlier values around 350000(days) which is around 958 years which is impossible and hence this has to be incorrect entry.
# we can see the stats for these columns below as well.
bank_data[['AMT_ANNUITY', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_GOODS_PRICE', 'DAYS_BIRTH','CNT_CHILDREN','DAYS_EMPLOYED']].describe()
| AMT_ANNUITY | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_GOODS_PRICE | DAYS_BIRTH | CNT_CHILDREN | DAYS_EMPLOYED | |
|---|---|---|---|---|---|---|---|
| count | 307499.000000 | 307511.000000 | 307511.000000 | 3.072330e+05 | 307511.000000 | 307511.000000 | 307511.000000 |
| mean | 27108.573909 | 1.687979 | 5.990260 | 5.383962e+05 | 16036.995067 | 0.417052 | 67724.742149 |
| std | 14493.737315 | 2.371231 | 4.024908 | 3.694465e+05 | 4363.988632 | 0.722121 | 139443.751806 |
| min | 1615.500000 | 0.256500 | 0.450000 | 4.050000e+04 | 7489.000000 | 0.000000 | 0.000000 |
| 25% | 16524.000000 | 1.125000 | 2.700000 | 2.385000e+05 | 12413.000000 | 0.000000 | 933.000000 |
| 50% | 24903.000000 | 1.471500 | 5.135310 | 4.500000e+05 | 15750.000000 | 0.000000 | 2219.000000 |
| 75% | 34596.000000 | 2.025000 | 8.086500 | 6.795000e+05 | 19682.000000 | 1.000000 | 5707.000000 |
| max | 258025.500000 | 1170.000000 | 40.500000 | 4.050000e+06 | 25229.000000 | 19.000000 | 365243.000000 |
### Finding outlier information in previous_data
plt.figure(figsize=(22,8))
prev_outlier_col_1 = ['AMT_ANNUITY','AMT_APPLICATION','AMT_CREDIT','AMT_GOODS_PRICE','SELLERPLACE_AREA']
prev_outlier_col_2 = ['SK_ID_CURR','DAYS_DECISION','CNT_PAYMENT']
for i in enumerate(prev_outlier_col_1):
plt.subplot(2,4,i[0]+1)
sns.boxplot(y=previous_data[i[1]])
for i in enumerate(prev_outlier_col_2):
plt.subplot(2,4,i[0]+6)
sns.boxplot(y=previous_data[i[1]])
# Insight: It can be seen that in previous application data
# AMT_ANNUITY, AMT_APPLICATION, AMT_CREDIT, AMT_GOODS_PRICE, SELLERPLACE_AREA have huge number of outliers.
# CNT_PAYMENT has few outlier values.
# SK_ID_CURR is an ID column and hence no outliers.
# DAYS_DECISION has little number of outliers indicating that these previous applications decisions were taken long back.
# we can see the stats for these columns below as well.
previous_data[['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_GOODS_PRICE', 'SELLERPLACE_AREA','CNT_PAYMENT','DAYS_DECISION']].describe()
| AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_GOODS_PRICE | SELLERPLACE_AREA | CNT_PAYMENT | DAYS_DECISION | |
|---|---|---|---|---|---|---|---|
| count | 1.670214e+06 | 1.670214e+06 | 1.670213e+06 | 1.670214e+06 | 1.670214e+06 | 1.670214e+06 | 1.670214e+06 |
| mean | 1.595512e+04 | 1.752339e+05 | 1.961140e+05 | 1.856429e+05 | 3.139511e+02 | 1.247621e+01 | 8.806797e+02 |
| std | 1.303122e+04 | 2.927798e+05 | 3.185746e+05 | 2.871413e+05 | 7.127443e+03 | 1.447588e+01 | 7.790997e+02 |
| min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -1.000000e+00 | 0.000000e+00 | 1.000000e+00 |
| 25% | 7.547096e+03 | 1.872000e+04 | 2.416050e+04 | 4.500000e+04 | -1.000000e+00 | 0.000000e+00 | 2.800000e+02 |
| 50% | 1.544967e+04 | 7.104600e+04 | 8.054100e+04 | 7.105050e+04 | 3.000000e+00 | 1.000000e+01 | 5.810000e+02 |
| 75% | 1.682403e+04 | 1.803600e+05 | 2.164185e+05 | 1.804050e+05 | 8.200000e+01 | 1.600000e+01 | 1.300000e+03 |
| max | 4.180581e+05 | 6.905160e+06 | 6.905160e+06 | 6.905160e+06 | 4.000000e+06 | 8.400000e+01 | 2.922000e+03 |
# Strategy:
# The data analysis flow has been planned in following way :
# Imbalance in Data
# Categorical Data Analysis
# Categorical segmented Univariate Analysis
# Categorical Bi/Multivariate analysis
#### Numeric Data Analysis
# Bi-furcation of databased based on TARGET data
# Correlation Matrix
# Numerical segmented Univariate Analysis
# Numerical Bi/Multivariate analysis
Imbalance = bank_data["TARGET"].value_counts().reset_index()
Imbalance
| TARGET | count | |
|---|---|---|
| 0 | 0 | 282686 |
| 1 | 1 | 24825 |
plt.figure(figsize = (6,4))
x = ['Repayer','Defaulter']
sns.barplot( x=x, y="count", data = Imbalance)
<Axes: ylabel='count'>
count_0 = Imbalance.iloc[0]["TARGET"]
count_0
0
count_1 = Imbalance.iloc[1]["TARGET"]
count_1
1
count_0_perc = round(count_0/(count_0+count_1)*100,2)
count_0_perc
0.0
count_1_perc = round(count_1/(count_0+count_1)*100,2)
count_1_perc
100.0
print('Ratios of imbalance in percentage with respect to Repayer and Defaulter datas are: %.2f and %.2f'%(count_0_perc,count_1_perc))
print('Ratios of imbalance in relative with respect to Repayer and Defaulter datas is %.2f : 1 (approx)'%(count_0/count_1))
Ratios of imbalance in percentage with respect to Repayer and Defaulter datas are: 0.00 and 100.00 Ratios of imbalance in relative with respect to Repayer and Defaulter datas is 0.00 : 1 (approx)
############# 5.2 Plotting Functions
bank_data
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_3 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | AMT_INCOME_RANGE | AMT_CREDIT_RANGE | AGE | AGE_GROUP | YEARS_EMPLOYED | EMPLOYMENT_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 2.025 | 4.065975 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | 9461 | 637 | 3648.0 | 2120 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 200k-300k | 400k-500k | 25 | 20-30 | 1 | 0-5 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 2.700 | 12.935025 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | 16765 | 1188 | 1186.0 | 291 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 200k-300k | 1M Above | 45 | 40-50 | 3 | 0-5 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 0.675 | 1.350000 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | 19046 | 225 | 4260.0 | 2531 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0-100K | 100K-200K | 52 | 50 above | 0 | NaN |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 1.350 | 3.126825 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | 19005 | 3039 | 9833.0 | 2437 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 100K-200K | 300k-400k | 52 | 50 above | 8 | 5-10 |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 1.215 | 5.130000 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | 19932 | 3038 | 4311.0 | 3458 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 100K-200K | 500k-600k | 54 | 50 above | 8 | 5-10 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307506 | 456251 | 0 | Cash loans | M | N | N | 0 | 1.575 | 2.547000 | 27558.0 | 225000.0 | Unaccompanied | Working | Secondary / secondary special | Separated | With parents | 0.032561 | 9327 | 236 | 8456.0 | 1982 | Sales staff | 1.0 | 1 | 1 | THURSDAY | 15 | 0 | 0 | 0 | 0 | 0 | 0 | Services | 0.0 | 0.0 | 0.0 | 0.0 | -273.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 100K-200K | 200k-300k | 25 | 20-30 | 0 | NaN |
| 307507 | 456252 | 0 | Cash loans | F | N | Y | 0 | 0.720 | 2.695500 | 12001.5 | 225000.0 | Unaccompanied | Pensioner | Secondary / secondary special | Widow | House / apartment | 0.025164 | 20775 | 365243 | 4388.0 | 4090 | Unknown | 1.0 | 2 | 2 | MONDAY | 8 | 0 | 0 | 0 | 0 | 0 | 0 | XNA | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0-100K | 200k-300k | 56 | 50 above | 1000 | NaN |
| 307508 | 456253 | 0 | Cash loans | F | N | Y | 0 | 1.530 | 6.776640 | 29979.0 | 585000.0 | Unaccompanied | Working | Higher education | Separated | House / apartment | 0.005002 | 14966 | 7921 | 6737.0 | 5150 | Managers | 1.0 | 3 | 3 | THURSDAY | 9 | 0 | 0 | 0 | 0 | 1 | 1 | School | 6.0 | 0.0 | 6.0 | 0.0 | -1909.0 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 100K-200K | 600k-700k | 41 | 40-50 | 21 | 20-30 |
| 307509 | 456254 | 1 | Cash loans | F | N | Y | 0 | 1.710 | 3.701070 | 20205.0 | 319500.0 | Unaccompanied | Commercial associate | Secondary / secondary special | Married | House / apartment | 0.005313 | 11961 | 4786 | 2562.0 | 931 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 9 | 0 | 0 | 0 | 1 | 1 | 0 | Business Entity Type 1 | 0.0 | 0.0 | 0.0 | 0.0 | -322.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 100K-200K | 300k-400k | 32 | 30-40 | 13 | 10-20 |
| 307510 | 456255 | 0 | Cash loans | F | N | N | 0 | 1.575 | 6.750000 | 49117.5 | 675000.0 | Unaccompanied | Commercial associate | Higher education | Married | House / apartment | 0.046220 | 16856 | 1262 | 5128.0 | 410 | Laborers | 2.0 | 1 | 1 | THURSDAY | 20 | 0 | 0 | 0 | 0 | 1 | 1 | Business Entity Type 3 | 0.0 | 0.0 | 0.0 | 0.0 | -787.0 | 1 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 100K-200K | 600k-700k | 46 | 40-50 | 3 | 0-5 |
307511 rows × 52 columns
# function for plotting repetitive countplots in univariate categorical analysis on applicationDF
# This function will create two subplots:
# 1. Count plot of categorical column w.r.t TARGET;
# 2. Percentage of defaulters within column.
def univariate_categorical(feature,ylog=False,label_rotation=False,horizontal_layout=True):
temp = bank_data[feature].value_counts()
df1 = pd.DataFrame({feature: temp.index,'Number of contracts': temp.values})
# Calculate the percentage of target=1 per category value
cat_perc = bank_data[[feature, 'TARGET']].groupby([feature],as_index=False).mean()
cat_perc["TARGET"] = cat_perc["TARGET"]*100
cat_perc.sort_values(by='TARGET', ascending=False, inplace=True)
if(horizontal_layout):
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))
else:
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(20,24))
# 1. Subplot 1: Count plot of categorical column
# sns.set_palette("Set2")
s = sns.countplot(ax=ax1,
x = feature,
data=bank_data,
hue ="TARGET",
order=cat_perc[feature],
palette=['g','r'])
# Define common styling
ax1.set_title(feature, fontdict={'fontsize' : 10, 'fontweight' : 3, 'color' : 'Blue'})
ax1.legend(['Repayer','Defaulter'])
# If the plot is not readable, use the log scale.
if ylog:
ax1.set_yscale('log')
ax1.set_ylabel("Count (log)",fontdict={'fontsize' : 10, 'fontweight' : 3, 'color' : 'Blue'})
if(label_rotation):
s.set_xticklabels(s.get_xticklabels(),rotation=90)
# 2. Subplot 2: Percentage of defaulters within the categorical column
s = sns.barplot(ax=ax2,
x = feature,
y='TARGET',
order=cat_perc[feature],
data=cat_perc,
palette='Set2')
if(label_rotation):
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.ylabel('Percent of Defaulters [%]', fontsize=10)
plt.tick_params(axis='both', which='major', labelsize=10)
ax2.set_title(feature + " Defaulter %", fontdict={'fontsize' : 15, 'fontweight' : 5, 'color' : 'Blue'})
plt.show();
def bivariate_bar(x,y,df,hue,figsize):
plt.figure(figsize=figsize)
sns.barplot(x=x,
y=y,
data=df,
hue=hue,
palette =['g','r'])
# Defining aesthetics of Labels and Title of the plot using style dictionaries
plt.xlabel(x,fontdict={'fontsize' : 10, 'fontweight' : 3, 'color' : 'Blue'})
plt.ylabel(y,fontdict={'fontsize' : 10, 'fontweight' : 3, 'color' : 'Blue'})
plt.title(col, fontdict={'fontsize' : 15, 'fontweight' : 5, 'color' : 'Blue'})
plt.xticks(rotation=90, ha='right')
plt.legend(labels = ['Repayer','Defaulter'])
plt.show()
def bivariate_rel(x,y,data, hue, kind, palette, legend,figsize):
plt.figure(figsize=figsize)
sns.relplot(x=x,
y=y,
data=bank_data,
hue="TARGET",
kind=kind,
palette = ['g','r'],
legend = False)
plt.legend(['Repayer','Defaulter'])
plt.xticks(rotation=90, ha='right')
plt.show()
def univariate_merged(col,df,hue,palette,ylog,figsize):
plt.figure(figsize=figsize)
ax=sns.countplot(x=col,
data=df,
hue= hue,
palette= palette,
order=df[col].value_counts().index)
if ylog:
plt.yscale('log')
plt.ylabel("Count (log)",fontdict={'fontsize' : 10, 'fontweight' : 3, 'color' : 'Blue'})
else:
plt.ylabel("Count",fontdict={'fontsize' : 10, 'fontweight' : 3, 'color' : 'Blue'})
plt.title(col , fontdict={'fontsize' : 15, 'fontweight' : 5, 'color' : 'Blue'})
plt.legend(loc = "upper right")
plt.xticks(rotation=90, ha='right')
plt.show()
def merged_pointplot(x,y):
plt.figure(figsize=(8,4))
sns.pointplot(x=x,
y=y,
hue="TARGET",
data=loan_process_df,
palette =['g','r'])
##### 5.3 Categorical Variables Analysis
### 5.3.1 Segmented Univariate Analysis
univariate_categorical('NAME_CONTRACT_TYPE',True)
univariate_categorical('CODE_GENDER')
univariate_categorical('FLAG_OWN_CAR')
univariate_categorical('FLAG_OWN_REALTY')
univariate_categorical("NAME_HOUSING_TYPE",True,True,True)
univariate_categorical("NAME_FAMILY_STATUS")
univariate_categorical("NAME_FAMILY_STATUS",False,True,True)
univariate_categorical("NAME_EDUCATION_TYPE",True,True,True)
univariate_categorical("NAME_INCOME_TYPE",True,True,False)
univariate_categorical("REGION_RATING_CLIENT",False,False,True)
univariate_categorical("OCCUPATION_TYPE",False,True,False)
univariate_categorical("ORGANIZATION_TYPE",True,True,False)
univariate_categorical("FLAG_DOCUMENT_3",False,False,True)
univariate_categorical("AGE_GROUP",False,False,True)
univariate_categorical("EMPLOYMENT_YEAR",False,False,True)
univariate_categorical("AMT_CREDIT_RANGE",False,False,False)
univariate_categorical("AMT_INCOME_RANGE",False,False,False)
univariate_categorical("CNT_CHILDREN",True)
univariate_categorical("CNT_FAM_MEMBERS",True, False, False)
#### 5.3.2 Categorical Bi/Multivariate Analysis
bank_data.groupby('NAME_INCOME_TYPE')['AMT_INCOME_TOTAL'].describe().T
| NAME_INCOME_TYPE | Businessman | Commercial associate | Maternity leave | Pensioner | State servant | Student | Unemployed | Working |
|---|---|---|---|---|---|---|---|---|
| count | 10.00000 | 71617.000000 | 5.000000 | 55362.000000 | 21703.000000 | 18.000000 | 22.000000 | 158774.000000 |
| mean | 6.52500 | 2.029553 | 1.404000 | 1.364013 | 1.797380 | 1.705000 | 1.105364 | 1.631699 |
| std | 6.27226 | 1.479742 | 1.268569 | 0.766503 | 1.008806 | 1.066447 | 0.880551 | 3.075777 |
| min | 1.80000 | 0.265500 | 0.495000 | 0.256500 | 0.270000 | 0.810000 | 0.265500 | 0.256500 |
| 25% | 2.25000 | 1.350000 | 0.675000 | 0.900000 | 1.125000 | 1.125000 | 0.540000 | 1.125000 |
| 50% | 4.95000 | 1.800000 | 0.900000 | 1.170000 | 1.575000 | 1.575000 | 0.787500 | 1.350000 |
| 75% | 8.43750 | 2.250000 | 1.350000 | 1.665000 | 2.250000 | 1.788750 | 1.350000 | 2.025000 |
| max | 22.50000 | 180.000900 | 3.600000 | 22.500000 | 31.500000 | 5.625000 | 3.375000 | 1170.000000 |
# Income type vs Income Amount Range
bivariate_bar("NAME_INCOME_TYPE","AMT_INCOME_TOTAL",bank_data,"TARGET",(18,10))
## Inferences:
## It can be seen that business man's income is the highest and the estimated range with default 95% confidence level seem to indicate that the income of a business man could be in the range of slightly close to 4 lakhs and slightly above 10 lakhs
### Numerical Univariate Analysis
Repayer_df = bank_data.loc[bank_data['TARGET']==0] # Repayers
Defaulter_df = bank_data.loc[bank_data['TARGET']==1] # Defaulters
amount = bank_data[[ 'AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY', 'AMT_GOODS_PRICE']]
fig = plt.figure(figsize=(16,12))
for i in enumerate(amount):
plt.subplot(2,2,i[0]+1)
sns.distplot(Defaulter_df[i[1]], hist=False, color='r',label ="Defaulter")
sns.distplot(Repayer_df[i[1]], hist=False, color='g', label ="Repayer")
plt.title(i[1], fontdict={'fontsize' : 15, 'fontweight' : 5, 'color' : 'Blue'})
plt.legend()
plt.show()
bivariate_rel('AMT_GOODS_PRICE','AMT_CREDIT',bank_data,"TARGET", "line", ['g','r'], False,(15,6))
<Figure size 1500x600 with 0 Axes>
amount = bank_data[[ 'AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY', 'AMT_GOODS_PRICE','TARGET']]
amount = amount[(amount["AMT_GOODS_PRICE"].notnull()) & (amount["AMT_ANNUITY"].notnull())]
ax= sns.pairplot(amount,hue="TARGET",palette=["g","r"])
ax.fig.legend(labels=['Repayer','Defaulter'])
plt.show()
### 6. Merged Dataframes Analysis
loan_process_df = pd.merge(bank_data, previous_data, how='inner', on='SK_ID_CURR')
loan_process_df.head()
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE_x | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT_x | AMT_ANNUITY_x | AMT_GOODS_PRICE_x | NAME_TYPE_SUITE_x | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_3 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | AMT_INCOME_RANGE | AMT_CREDIT_RANGE | AGE | AGE_GROUP | YEARS_EMPLOYED | EMPLOYMENT_YEAR | SK_ID_PREV | NAME_CONTRACT_TYPE_y | AMT_ANNUITY_y | AMT_APPLICATION | AMT_CREDIT_y | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE_y | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE_y | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | DAYS_DECISION_GROUP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 2.025 | 4.065975 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | 9461 | 637 | 3648.0 | 2120 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 200k-300k | 400k-500k | 25 | 20-30 | 1 | 0-5 | 1038818 | Consumer loans | 9251.775 | 179055.0 | 179055.0 | 0.0 | 179055.0 | 0.000000 | NaN | NaN | XAP | Approved | 606 | XNA | XAP | NaN | New | Vehicles | POS | XNA | Stone | 500 | Auto technology | 24.0 | low_normal | POS other with interest | 365243.0 | -565.0 | 125.0 | -25.0 | -17.0 | 0.0 | 400-800 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 2.700 | 12.935025 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | 16765 | 1188 | 1186.0 | 291 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 200k-300k | 1M Above | 45 | 40-50 | 3 | 0-5 | 1810518 | Cash loans | 98356.995 | 900000.0 | 1035882.0 | NaN | 900000.0 | NaN | NaN | NaN | XNA | Approved | 746 | XNA | XAP | Unaccompanied | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | low_normal | Cash X-Sell: low | 365243.0 | -716.0 | -386.0 | -536.0 | -527.0 | 1.0 | 400-800 |
| 2 | 100003 | 0 | Cash loans | F | N | N | 0 | 2.700 | 12.935025 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | 16765 | 1188 | 1186.0 | 291 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 200k-300k | 1M Above | 45 | 40-50 | 3 | 0-5 | 2636178 | Consumer loans | 64567.665 | 337500.0 | 348637.5 | 0.0 | 337500.0 | 0.000000 | NaN | NaN | XAP | Approved | 828 | Cash through the bank | XAP | Family | Refreshed | Furniture | POS | XNA | Stone | 1400 | Furniture | 6.0 | middle | POS industry with interest | 365243.0 | -797.0 | -647.0 | -647.0 | -639.0 | 0.0 | 800-1200 |
| 3 | 100003 | 0 | Cash loans | F | N | N | 0 | 2.700 | 12.935025 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | 16765 | 1188 | 1186.0 | 291 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 200k-300k | 1M Above | 45 | 40-50 | 3 | 0-5 | 2396755 | Consumer loans | 6737.310 | 68809.5 | 68053.5 | 6885.0 | 68809.5 | 0.100061 | NaN | NaN | XAP | Approved | 2341 | Cash through the bank | XAP | Family | Refreshed | Consumer Electronics | POS | XNA | Country-wide | 200 | Consumer electronics | 12.0 | middle | POS household with interest | 365243.0 | -2310.0 | -1980.0 | -1980.0 | -1976.0 | 1.0 | 2000-2400 |
| 4 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 0.675 | 1.350000 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | 19046 | 225 | 4260.0 | 2531 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0-100K | 100K-200K | 52 | 50 above | 0 | NaN | 1564014 | Consumer loans | 5357.250 | 24282.0 | 20106.0 | 4860.0 | 24282.0 | 0.212008 | NaN | NaN | XAP | Approved | 815 | Cash through the bank | XAP | Unaccompanied | New | Mobile | POS | XNA | Regional / Local | 30 | Connectivity | 4.0 | middle | POS mobile without interest | 365243.0 | -784.0 | -694.0 | -724.0 | -714.0 | 0.0 | 800-1200 |
loan_process_df.shape
(1413701, 85)
loan_process_df.size
120164585
loan_process_df.describe()
| SK_ID_CURR | TARGET | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT_x | AMT_ANNUITY_x | AMT_GOODS_PRICE_x | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | CNT_FAM_MEMBERS | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_3 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | AGE | YEARS_EMPLOYED | SK_ID_PREV | AMT_ANNUITY_y | AMT_APPLICATION | AMT_CREDIT_y | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE_y | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | DAYS_DECISION | SELLERPLACE_AREA | CNT_PAYMENT | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413608e+06 | 1.412493e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.410555e+06 | 1.410555e+06 | 1.410555e+06 | 1.410555e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 1.413700e+06 | 6.641610e+05 | 1.413701e+06 | 664161.000000 | 4791.000000 | 4791.000000 | 1.413701e+06 | 1.413701e+06 | 1.413701e+06 | 852595.000000 | 852595.000000 | 852595.000000 | 852595.000000 | 852595.000000 | 852595.000000 |
| mean | 2.784813e+05 | 8.655296e-02 | 4.048933e-01 | 1.733160e+00 | 5.875537e+00 | 2.701702e+04 | 5.277186e+05 | 2.074985e-02 | 1.632105e+04 | 7.266347e+04 | 5.003233e+03 | 3.034563e+03 | 2.150501e+00 | 1.198433e+01 | 1.207327e-02 | 1.544176e+00 | 1.540436e-01 | 1.526303e+00 | 1.080426e-01 | -1.084701e+03 | 7.385600e-01 | 5.484894e-03 | 6.028149e-03 | 3.410198e-02 | 2.664913e-01 | 3.196935e-01 | 2.691239e+00 | 4.421384e+01 | 1.985500e+02 | 1.922744e+06 | 1.586281e+04 | 1.752436e+05 | 1.963541e+05 | 6.655317e+03 | 1.854396e+05 | 0.080175 | 0.189436 | 0.771284 | 8.803670e+02 | 3.149878e+02 | 1.256367e+01 | 342257.656710 | 13488.741567 | 33274.831806 | 76665.634755 | 82353.171672 | 0.330572 |
| std | 1.028118e+05 | 2.811789e-01 | 7.173454e-01 | 1.985734e+00 | 3.849173e+00 | 1.395116e+04 | 3.532465e+05 | 1.334702e-02 | 4.344557e+03 | 1.433374e+05 | 3.551051e+03 | 1.507376e+03 | 9.006787e-01 | 3.232181e+00 | 1.092132e-01 | 2.530715e+00 | 4.658973e-01 | 2.508953e+00 | 3.790588e-01 | 7.999369e+02 | 4.394192e-01 | 7.702591e-02 | 1.001966e-01 | 2.012902e-01 | 9.268428e-01 | 8.781444e-01 | 2.157176e+00 | 1.190217e+01 | 3.926378e+02 | 5.327153e+05 | 1.302714e+04 | 2.936222e+05 | 3.194813e+05 | 2.062030e+04 | 2.881244e+05 | 0.107784 | 0.090849 | 0.100644 | 7.835402e+02 | 7.695082e+03 | 1.448807e+01 | 88832.266598 | 71650.147146 | 106161.775933 | 149704.716371 | 153537.064274 | 0.470419 |
| min | 1.000020e+05 | 0.000000e+00 | 0.000000e+00 | 2.565000e-01 | 4.500000e-01 | 1.615500e+03 | 4.050000e+04 | 2.900000e-04 | 7.489000e+03 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -4.292000e+03 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.000000e+01 | 0.000000e+00 | 1.000001e+06 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -9.000000e-01 | 0.000000e+00 | -0.000015 | 0.034781 | 0.373150 | 1.000000e+00 | -1.000000e+00 | 0.000000e+00 | -2922.000000 | -2892.000000 | -2801.000000 | -2889.000000 | -2874.000000 | 0.000000 |
| 25% | 1.893640e+05 | 0.000000e+00 | 0.000000e+00 | 1.125000e+00 | 2.700000e+00 | 1.682100e+04 | 2.385000e+05 | 1.003200e-02 | 1.273900e+04 | 1.042000e+03 | 2.001000e+03 | 1.783000e+03 | 2.000000e+00 | 1.000000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -1.683000e+03 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 3.400000e+01 | 2.000000e+00 | 1.461346e+06 | 7.406055e+03 | 1.975050e+04 | 2.488050e+04 | 0.000000e+00 | 4.500000e+04 | 0.000000 | 0.160716 | 0.715645 | 2.710000e+02 | -1.000000e+00 | 0.000000e+00 | 365243.000000 | -1630.000000 | -1244.000000 | -1316.000000 | -1269.000000 | 0.000000 |
| 50% | 2.789920e+05 | 0.000000e+00 | 0.000000e+00 | 1.575000e+00 | 5.084955e+00 | 2.492550e+04 | 4.500000e+05 | 1.885000e-02 | 1.604400e+04 | 2.401000e+03 | 4.508000e+03 | 3.330000e+03 | 2.000000e+00 | 1.200000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -1.011000e+03 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.000000e+00 | 4.300000e+01 | 6.000000e+00 | 1.922698e+06 | 1.507734e+04 | 7.087050e+04 | 8.059500e+04 | 1.791000e+03 | 7.087500e+04 | 0.062489 | 0.189136 | 0.835095 | 5.820000e+02 | 4.000000e+00 | 1.000000e+01 | 365243.000000 | -825.000000 | -358.000000 | -534.000000 | -494.000000 | 0.000000 |
| 75% | 3.675560e+05 | 0.000000e+00 | 1.000000e+00 | 2.070000e+00 | 8.079840e+00 | 3.454200e+04 | 6.795000e+05 | 2.866300e-02 | 1.998000e+04 | 6.313000e+03 | 7.510000e+03 | 4.319000e+03 | 3.000000e+00 | 1.400000e+01 | 0.000000e+00 | 2.000000e+00 | 0.000000e+00 | 2.000000e+00 | 0.000000e+00 | -3.960000e+02 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.000000e+00 | 5.400000e+01 | 1.700000e+01 | 2.384012e+06 | 1.674797e+04 | 1.800000e+05 | 2.156400e+05 | 7.695000e+03 | 1.800000e+05 | 0.108912 | 0.193330 | 0.852537 | 1.313000e+03 | 8.500000e+01 | 1.800000e+01 | 365243.000000 | -408.000000 | 135.000000 | -73.000000 | -42.000000 | 1.000000 |
| max | 4.562550e+05 | 1.000000e+00 | 1.900000e+01 | 1.170000e+03 | 4.050000e+01 | 2.250000e+05 | 4.050000e+06 | 7.250800e-02 | 2.520100e+04 | 3.652430e+05 | 2.467200e+04 | 7.197000e+03 | 2.000000e+01 | 2.300000e+01 | 1.000000e+00 | 3.480000e+02 | 3.400000e+01 | 3.440000e+02 | 2.400000e+01 | 0.000000e+00 | 1.000000e+00 | 4.000000e+00 | 9.000000e+00 | 8.000000e+00 | 2.700000e+01 | 2.610000e+02 | 2.500000e+01 | 6.900000e+01 | 1.000000e+03 | 2.845381e+06 | 4.180581e+05 | 5.850000e+06 | 4.509688e+06 | 3.060045e+06 | 5.850000e+06 | 1.000000 | 1.000000 | 1.000000 | 2.922000e+03 | 4.000000e+06 | 8.400000e+01 | 365243.000000 | 365243.000000 | 365243.000000 | 365243.000000 | 365243.000000 | 1.000000 |
# Bifurcating the applicationDF dataframe based on Target value 0 and 1 for correlation and other analysis
L0 = loan_process_df[loan_process_df['TARGET']==0] # Repayers
L1 = loan_process_df[loan_process_df['TARGET']==1] # Defaulters
### Plotting Contract Status vs purpose of the loan:
univariate_merged("NAME_CASH_LOAN_PURPOSE",L0,"NAME_CONTRACT_STATUS",["#548235","#FF0000","#0070C0","#FFFF00"],True,(18,7))
univariate_merged("NAME_CASH_LOAN_PURPOSE",L1,"NAME_CONTRACT_STATUS",["#548235","#FF0000","#0070C0","#FFFF00"],True,(18,7))
# Checking the Contract Status based on loan repayment status and whether there is any business loss or financial loss
univariate_merged("NAME_CONTRACT_STATUS",loan_process_df,"TARGET",['g','r'],False,(12,8))
g = loan_process_df.groupby("NAME_CONTRACT_STATUS")["TARGET"]
df1 = pd.concat([g.value_counts(),round(g.value_counts(normalize=True).mul(100),2)],axis=1, keys=('Counts','Percentage'))
df1['Percentage'] = df1['Percentage'].astype(str) +"%" # adding percentage symbol in the results for understanding
print (df1)
Counts Percentage
NAME_CONTRACT_STATUS TARGET
Approved 0 818856 92.41%
1 67243 7.59%
Canceled 0 235641 90.83%
1 23800 9.17%
Refused 0 215952 88.0%
1 29438 12.0%
Unused offer 0 20892 91.75%
1 1879 8.25%
# plotting the relationship between income total and contact status
merged_pointplot("NAME_CONTRACT_STATUS",'AMT_INCOME_TOTAL')
# plotting the relationship between people who defaulted in last 60 days being in client's social circle and contact status
merged_pointplot("NAME_CONTRACT_STATUS",'DEF_60_CNT_SOCIAL_CIRCLE')
### 7. Conclusions